From 187511909594737eb45eef37ad6010edad010534 Mon Sep 17 00:00:00 2001 From: WoosungMyung Date: Sat, 4 Jan 2025 08:47:46 +0900 Subject: [PATCH 01/21] Explanation of Function change --- src/bootstrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index c1d085e4c..e49514b90 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -70,7 +70,7 @@ static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) { int ir = BOOTSTRAP_PID(root, nRoots); return rank - firstRankFromRoot(ir, nRanks, nRoots); } -// return the number of child for a root, root will be periodized +// Check if the given rank is the first rank from the root static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) { return (rank == firstRankFromRoot(root, nRanks, nRoots)); } From 6aae37927840b2bd7b7d42d2f0050e75f88ee97f Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 18 Dec 2024 08:26:06 -0800 Subject: [PATCH 02/21] 2.24.3-1 Network user buffer support for collectives * Leverage user buffer registration to achieve zero-copy inter-node communications for Ring, NVLS and Collnet Add RAS subsystem * Create a RAS thread keeping track of all NCCL communicators. * Add a ncclras tool contacting the RAS thread and getting a report. Add fp8 support * Add support for e5m2 and e4m3 8-bit floating point operations. * Use Tree/PAT algorithms when possible for better numerical stability. Add NIC fusion * Add a NET API to ask the network plugin to fuse a set of interfaces together. * Fuse multiple NICs under the same PCI switch as a single, larger NIC. Socket connection failure retry * Retry in case of socket connection failure (unreachable host) * Avoid "Software caused connection abort" errors on retries QP connection failure retry * Retry in case of IB QP connection failure during ibv_modify_qp. NET API improvements * Allow plugins to force a flush in case data and completion ordering is not guaranteed. * Indicate when completion is not needed (e.g. for the LL128 protocol), allowing plugins to skip generating a completion. * Allow for full offload of allgather operations when using one GPU per node. NCCL_ALGO/NCCL_PROTO strict enforcement * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify ALGO/PROTO filters for each collective operation. * Strictly enforce the ALGO/PROTO filters, no longer fall back on the ring algorithm when the filtering leaves no option and error out instead. Enable CUMEM host allocations * Use cumem functions for host memory allocation by default. Improved profiler plugin API * Avoid dependencies with NCCL includes. * Add information on whether the buffer is registered or not Adjust PAT tuning * Improve transition between PAT and ring at scale. Fix hangs when running with different CPU architectures * Detect when we use a mix of GPU architectures * Ensure Algo/Proto decisions are made based on that unified state. Fix FD leak in UDS * Fix a leak when mapping buffers intra-node with cumem IPCs. Fix crash when mixing buffer registration and graph buffer registration. * Separate local and graph registration to avoid crashes when we free buffers. Fix user buffer registration with dmabuf * Make ncclSend/ncclRecv communication with buffer registration functional on network plugins relying on dmabuf for buffer registration. Fix crash in IB code caused by uninitialized fields. 
Fix non-blocking ncclSend/ncclRecv * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking mode even though the operation was not enqueued onto the stream. * Issue #1495 Various compiler tweaks and fixes * PR #758 Fix typo in ncclTopoPrintGraph * Issue #1468 --- ext-net/README.md | 61 +- ext-net/example/nccl/net.h | 3 + ext-net/example/nccl/net_device.h | 3 +- ext-net/example/nccl/net_v8.h | 2 - ext-net/example/nccl/net_v9.h | 99 ++ ext-net/example/plugin.c | 102 +- ext-profiler/example/event.h | 21 +- ext-profiler/example/nccl/profiler.h | 1 + ext-profiler/example/nccl/profiler_v1.h | 53 +- ext-profiler/example/nccl/profiler_v2.h | 146 ++ ext-profiler/example/plugin.c | 114 +- ext-profiler/example/print_event.c | 72 +- ext-tuner/example/nccl/tuner.h | 9 +- ext-tuner/example/plugin.c | 9 +- makefiles/common.mk | 5 + makefiles/version.mk | 4 +- src/Makefile | 30 +- src/bootstrap.cc | 103 +- src/collectives.cc | 36 +- src/debug.cc | 16 + src/device/all_gather.h | 145 +- src/device/all_reduce.h | 167 +-- src/device/broadcast.h | 52 +- src/device/common.h | 3 + src/device/common_kernel.h | 19 +- src/device/generate.py | 35 +- src/device/network/unpack/unpack.h | 4 + src/device/onerank.cu | 4 + src/device/primitives.h | 7 +- src/device/prims_ll.h | 6 +- src/device/prims_ll128.h | 6 +- src/device/prims_simple.h | 238 +-- src/device/reduce_kernel.h | 171 ++- src/device/reduce_scatter.h | 55 +- src/device/sendrecv.h | 18 +- src/enqueue.cc | 820 +++++------ src/graph/paths.cc | 71 +- src/graph/search.cc | 2 +- src/graph/topo.cc | 600 +++++++- src/graph/topo.h | 4 + src/graph/tuning.cc | 246 ++-- src/graph/xml.cc | 16 +- src/graph/xml.h | 27 +- src/group.cc | 22 +- src/include/collectives.h | 321 ++++- src/include/comm.h | 22 +- src/include/debug.h | 2 + src/include/device.h | 30 +- src/include/enqueue.h | 11 + src/include/graph.h | 11 +- src/include/ibvwrap.h | 12 + src/include/nccl_common.h | 1 + src/include/nccl_net.h | 168 ++- src/include/nccl_profiler.h | 121 +- src/include/nccl_tuner.h | 53 +- src/include/net_device.h | 3 +- src/include/nvmlwrap.h | 2 +- src/include/profiler.h | 8 +- src/include/proxy.h | 32 +- src/include/ras.h | 24 + src/include/register.h | 21 +- src/include/shmutils.h | 2 +- src/include/socket.h | 26 +- src/include/transport.h | 18 +- src/include/utils.h | 3 +- src/init.cc | 144 +- src/misc/cudawrap.cc | 37 +- src/misc/ibvwrap.cc | 94 +- src/misc/ipcsocket.cc | 18 +- src/misc/nvmlwrap.cc | 8 +- src/misc/profiler.cc | 220 ++- src/misc/shmutils.cc | 6 +- src/misc/socket.cc | 299 ++-- src/misc/tuner.cc | 57 +- src/nccl.h.in | 15 +- src/net.cc | 681 ++++++--- src/proxy.cc | 95 +- src/ras/client.cc | 318 ++++ src/ras/client_support.cc | 1755 +++++++++++++++++++++++ src/ras/collectives.cc | 762 ++++++++++ src/ras/peers.cc | 960 +++++++++++++ src/ras/ras.cc | 668 +++++++++ src/ras/ras_internal.h | 512 +++++++ src/ras/rasnet.cc | 1189 +++++++++++++++ src/register.cc | 204 --- src/register/coll_reg.cc | 446 ++++++ src/register/register.cc | 179 +++ src/register/sendrecv_reg.cc | 35 + src/transport.cc | 28 +- src/transport/coll_net.cc | 585 +++++--- src/transport/generic.cc | 22 +- src/transport/net.cc | 468 ++++-- src/transport/net_ib.cc | 552 ++++--- src/transport/net_socket.cc | 17 +- src/transport/nvls.cc | 287 ++-- src/transport/p2p.cc | 404 +++--- src/transport/shm.cc | 30 +- 97 files changed, 12537 insertions(+), 3076 deletions(-) create mode 100644 ext-net/example/nccl/net_v9.h create mode 100644 ext-profiler/example/nccl/profiler_v2.h create mode 100644 
src/include/ras.h create mode 100644 src/ras/client.cc create mode 100644 src/ras/client_support.cc create mode 100644 src/ras/collectives.cc create mode 100644 src/ras/peers.cc create mode 100644 src/ras/ras.cc create mode 100644 src/ras/ras_internal.h create mode 100644 src/ras/rasnet.cc delete mode 100644 src/register.cc create mode 100644 src/register/coll_reg.cc create mode 100644 src/register/register.cc create mode 100644 src/register/sendrecv_reg.cc diff --git a/ext-net/README.md b/ext-net/README.md index 781fd904a..aa1a3945e 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,9 +60,9 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v6) +# API (v9) -Below is the main `ncclNet_v6` struct. Each function is explained in later sections. +Below is the main `ncclNet_v9` struct. Each function is explained in later sections. ``` typedef struct { @@ -73,7 +73,7 @@ typedef struct { // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -82,24 +82,26 @@ typedef struct { // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -110,7 +112,17 @@ typedef struct { ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_t; ``` ## Error codes @@ -136,11 +148,19 @@ not need to rely on CUDA, this should not be common. NCCL will call the `init` function first, then query the number of network devices with the `devices` function, getting each network device properties with `getProperties`. +If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice` +specifying a list of physical devices (the original devices listed from `devices`) it wishes to +merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null. + To establish a connection between two network devices, NCCL will first call `listen` on the receiving side, pass the returned handle to the sender side of the connection, and call `connect` with that handle. Finally, `accept` will be called on the receiving side to finalize the connection establishment. +`connect` and `accept` can receive an optional `netDevComm` pointer from the caller, if the caller +wishes to make use of device networking. This parameter may be ignored by the plugin if it does +not support device-side networking. + Once the connection is established, communication will be done using the functions `isend`, `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers. @@ -219,6 +239,12 @@ different offset within the original buffer, with a smaller size, etc), then der The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping on the network adapter. +The `forceFlush` field can request the NCCL core to call flush for all transfers. By default, +flushes are only called when the GPU architecture or PCI topology would not guarantee correct +PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the +completion paths use different PCI links and therefore need a call to flush() to guarantee +ordering. + The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is important to ensure proper optimization of flows within the node. @@ -234,6 +260,17 @@ The `maxComms` field indicates the maximum number of connections we can create.
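For illustration only (not part of this patch): a plugin that supports NIC fusion might implement `makeVDevice` along the lines of the sketch below, recording which physical devices were fused and returning the index of the new virtual NIC. The `MAX_VDEVS`, `vDevTable` and `nPhysDevs` bookkeeping, as well as the choice to index virtual NICs after the physical ones, are hypothetical; only the `ncclNetVDeviceProps_t` type, the `makeVDevice` signature and the error codes come from the v9 API shown in this patch.

```
#include "net.h"   // provides ncclResult_t, ncclNetVDeviceProps_t, NCCL_NET_MAX_DEVS_PER_NIC

#define MAX_VDEVS 8                                 // hypothetical plugin-side limit

static ncclNetVDeviceProps_t vDevTable[MAX_VDEVS];  // remembers which devices were fused together
static int nPhysDevs = 2;                           // whatever the plugin reported from devices()
static int nVDevs = 0;                              // virtual NICs created so far

// Record the set of physical devices NCCL asked to fuse and return the index of the
// resulting virtual NIC; getProperties() for that index would then report the merged
// vProps and the aggregated speed.
static ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
  if (props == NULL || props->ndevs <= 0 || props->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) return ncclInternalError;
  if (nVDevs >= MAX_VDEVS) return ncclInternalError;
  vDevTable[nVDevs] = *props;
  *d = nPhysDevs + nVDevs;  // in this sketch, virtual NICs are indexed after the physical ones
  nVDevs++;
  return ncclSuccess;
}
```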
The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped receive). +The `netDeviceType` indicates which type of device networking this plugin supports. The currently supported +options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`. + +The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code. + +The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for +point-to-point and collective calls. This will tell the NCCL core to cut large operations into +multiple smaller chunks if needed. + +`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device. + ### Connection establishment Connections are used in a unidirectional manner. There is therefore a sender side and a receiver @@ -332,6 +369,12 @@ handled by a single request handle. The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation. The contrary (receive size being lower than the send size) is an error, however. +NCCL sets the request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using +LL or LL128 protocols. In these cases, NCCL polls on a flag embedded in the data to detect completion +of the irecv and is resilient to redundant network writes. This allows the plugin to optimize request +completions on such irecvs (for example, complete the request immediately). The plugin is still +expected to set a valid request pointer on return which NCCL can poll to check for completion. + Note: for a given connection, send/receive operations should always match in the order they were posted.
Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 2aea8c439..112967ab8 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -12,6 +12,8 @@ #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -20,6 +22,7 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +#include "net_v9.h" #include "net_v8.h" #include "net_v7.h" #include "net_v6.h" diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index b430d9064..874fb5999 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -25,6 +25,7 @@ typedef struct { } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; -typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 316155820..54a61f61b 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -23,8 +23,6 @@ typedef struct { int netDeviceVersion; // Version number for network offload } ncclNetProperties_v8_t; -typedef ncclNetProperties_v8_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h new file mode 100644 index 000000000..61035ecc9 --- /dev/null +++ b/ext-net/example/nccl/net_v9.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V9_H_ +#define NCCL_NET_V9_H_ + +#include "net_device.h" + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; +typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef ncclNetProperties_v9_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. 
makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_v9_t; + +#endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 128dde9b4..285224261 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,15 +7,15 @@ #include "net.h" #define __hidden __attribute__ ((visibility("hidden"))) +#define NCCL_PLUGIN_MAX_RECVS 1 int max_requests = NCCL_NET_MAX_REQUESTS; __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } - __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } -__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) { +__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { // Below are default values, if unsure don't change. props->name = "Example"; @@ -27,6 +27,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props props->ptrSupport = NCCL_PTR_HOST; // If you regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled. props->regIsGlobal = 0; + // Force flush after receive. Needed if the control path and data path use a different path to the GPU + props->forceFlush = 0; // Speed in *Mbps*. 100000 means 100G props->speed = 100000; // Port number, used in conjunction with guid @@ -36,20 +38,27 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props // Maximum number of comm objects we can create. props->maxComms = 1024*1024; // Maximum number of receive operations taken by irecv(). - props->maxRecvs = 1; + props->maxRecvs = NCCL_PLUGIN_MAX_RECVS; // Coupling with NCCL network device-side code. - props->netDeviceType = 0; + props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - return ncclInternalError; + // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices. 
+ props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + // maximum transfer sizes the plugin can handle + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES; + return ncclSuccess; } + __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; } -__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -57,10 +66,11 @@ __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } +__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } #define PLUGIN_NAME "Plugin" -const ncclNet_v8_t ncclNetPlugin_v8 = { +ncclNet_v9_t ncclNetPlugin_v9 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -80,8 +90,60 @@ const ncclNet_v8_t ncclNetPlugin_v8 = { .closeListen = pluginCloseListen, .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { + ncclNetProperties_t props; + ncclResult_t ret = pluginGetProperties(dev, &props); + if (ret != ncclSuccess) return ret; + props_v8->name = props.name; + props_v8->pciPath = props.pciPath; + props_v8->guid = props.guid; + props_v8->ptrSupport = props.ptrSupport; + props_v8->regIsGlobal = props.regIsGlobal; + props_v8->speed = 
props.speed; + props_v8->latency = props.latency; + props_v8->port = props.port; + props_v8->maxComms = props.maxComms; + props_v8->maxRecvs = props.maxRecvs; + props_v8->netDeviceType = props.netDeviceType; + props_v8->netDeviceVersion = props.netDeviceVersion; + return ncclSuccess; +} + +__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); +} + +__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { + size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; + for (int i=0; iguid = props.guid; props_v7->ptrSupport = props.ptrSupport; props_v7->speed = props.speed; + props_v7->latency = props.latency; props_v7->port = props.port; props_v7->maxComms = props.maxComms; props_v7->maxRecvs = props.maxRecvs; @@ -114,8 +177,8 @@ const ncclNet_v7_t ncclNetPlugin_v7 = { .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -134,6 +197,7 @@ __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* pr props_v6->guid = props.guid; props_v6->ptrSupport = props.ptrSupport; props_v6->speed = props.speed; + props_v6->latency = props.latency; props_v6->port = props.port; props_v6->maxComms = props.maxComms; props_v6->maxRecvs = props.maxRecvs; @@ -154,8 +218,8 @@ const ncclNet_v6_t ncclNetPlugin_v6 = { .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -174,8 +238,8 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -198,11 +262,11 @@ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* prop return ncclSuccess; } static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { - return pluginIsend(sendComm, data, size, 0, mhandle, request); + return pluginIsend_v8(sendComm, data, size, 0, mhandle, request); } static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { int tag = 0; - return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); + return pluginIrecv_v8(recvComm, 1, &data, &size, &tag, &mhandle, request); } static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { return pluginIflush(recvComm, 1, &data, &size, &mhandle, request); diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 743280813..1486a2248 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -14,6 +14,7 @@ #define MAX_CHANNELS 32 #define MAX_STEPS 16 +#define MAX_OPS 16 // Up to 64K ranks for PAT #define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) #define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) @@ -86,7 +87,7 @@ struct taskEventBase { int rank; // rank of the operation in NCCL communicator const char* name; 
// FIXME: unused uint64_t commHash; // communicator identifier - uint8_t func; // ncclFunc* + const char* func; // ncclFunc* int refCount; // number of references for this operation struct group* parent; // parent event group struct taskEventBase* next; // next top level event in group @@ -102,16 +103,14 @@ struct collective { size_t count; size_t trafficBytes; int root; - uint8_t datatype; + const char* datatype; uint8_t nMaxChannels; - uint8_t algo; - uint8_t proto; - int op; + const char* algo; + const char* proto; int nWarps; - int isCollnet; - int isNvls; - struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events - struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events + struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events + struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events + int nProxyOps[MAX_CHANNELS]; }; struct p2p { @@ -119,9 +118,9 @@ struct p2p { uint8_t func; void const* buff; size_t count; - uint8_t datatype; + const char* datatype; int peer; - struct proxyOp op; + struct proxyOp op[MAX_CHANNELS]; }; struct group { diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index db7bc3fea..6680cfece 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -13,6 +13,7 @@ #include "common.h" #include "err.h" +#include "profiler_v2.h" #include "profiler_v1.h" #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h index 8724a1c66..7d34bed57 100644 --- a/ext-profiler/example/nccl/profiler_v1.h +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -9,16 +9,6 @@ #include -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileNumEvents = ( 6), -}; - typedef struct { uint8_t type; // event type descriptor: ncclProfileColl, ... 
void* parentObj; // pointer to the profiler parent object (for coll is the group) @@ -69,42 +59,8 @@ typedef struct { }; } ncclProfilerEventDescr_v1_t; -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v1_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v1_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; @@ -142,9 +98,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v1_t; -typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v1_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h new file mode 100644 index 000000000..aab4ccf86 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v2.h @@ -0,0 +1,146 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_V2_H_ +#define NCCL_PROFILER_V2_H_ + +#include + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type +}; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; +typedef 
ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v2_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index f9de60813..64d5d8be1 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -21,11 +21,18 @@ static int initialized; // initialization counter for profiler static double startTime; // profiler start time -static int groupPoolSize = 16; -static int collPoolSize = 16; -static int p2pPoolSize = 1024; -static int proxyCtrlPoolSize = 16; -static int detachPoolSize = 128; +static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p; +static const int defaultGroupPoolSize = 16; +static const int defaultCollPoolSize = 16; +static const int defaultP2pPoolSize = 1024; +static const int defaultProxyCtrlPoolSize = 16; +static const int defaultDetachPoolSize = 128; + +static int groupPoolSize; +static int collPoolSize; +static int p2pPoolSize; +static int proxyCtrlPoolSize; +static int detachPoolSize; static int detachPoolBase; static int detachPoolIndex; static int detachPoolDone; @@ -56,25 +63,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool - __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED); - if (getenv("NCCL_PROFILE_EVENT_MASK")) { - __atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED); - } - if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) { - groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) { - collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) { - p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) { - proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) { - detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")); - } + const char* str; + str = getenv("NCCL_PROFILE_EVENT_MASK"); + __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED); + + str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); + groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; + + str = getenv("NCCL_PROFILE_COLL_POOL_SIZE"); + collPoolSize = str ? atoi(str) : defaultCollPoolSize; + + str = getenv("NCCL_PROFILE_P2P_POOL_SIZE"); + p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize; + + str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE"); + proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize; + + str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE"); + detachPoolSize = str ? 
atoi(str) : defaultDetachPoolSize; + // detach pool is used to store PXN proxyOps and is shared among threads detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool)); if (detachPool == NULL) { @@ -107,6 +114,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool)); if (ctx->proxyCtrlPool == NULL) goto fail; + // Print event pool sizes for debugging + //fprintf(stdout, "Profiler: Group pool size (bytes): %lu\n", sizeof(struct group)*groupPoolSize); + //fprintf(stdout, "Profiler: Coll pool size (bytes): %lu\n", sizeof(struct collective)*collPoolSize); + //fprintf(stdout, "Profiler: P2p pool size (bytes): %lu\n", sizeof(struct p2p)*p2pPoolSize); + //fprintf(stdout, "Profiler: Proxy pool size (bytes): %lu\n", sizeof(struct proxyCtrl)*proxyCtrlPoolSize); + //fprintf(stdout, "Profiler: PXN pool size (bytes): %lu\n", sizeof(struct proxyOp)*detachPoolSize); + *context = ctx; return ncclSuccess; @@ -154,7 +168,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { free(ctx); // last thread cleans up shared detach pool - if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) { + if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) { start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0; end = detachPoolIndex; for (int i = start; i < end; i++) { @@ -171,7 +185,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { __hidden void updateEvent(void* handle); -__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) { +__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { *eHandle = NULL; struct context* ctx = (struct context *)context; if (eDescr->type == ncclProfileGroup) { @@ -185,14 +199,15 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps - memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS); - memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); + memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); + memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); } else if (base->type == ncclProfileP2p) { struct p2p* p = (struct p2p *)base; // reset event proxyOp and proxySteps - memset(&p->op, 0, sizeof(struct proxyOp)); + memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS); // release p2p events in the group and return them to the p2p pool __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); } @@ -203,7 +218,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n return ncclSuccess; } event->type = ncclProfileGroup; - __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED); event->ctx = ctx; event->groupId = groupId; event->startTs = gettime() - startTime; @@ -238,14 +252,11 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->op = eDescr->coll.op; event->trafficBytes = eDescr->coll.trafficBytes; 
event->nMaxChannels = eDescr->coll.nMaxChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; event->proto = eDescr->coll.proto; - event->isCollnet = eDescr->coll.isCollnet; - event->isNvls = eDescr->coll.isNvls; *eHandle = event; taskEventQueueEnqueue(parent, (struct taskEventBase *)event); // increment the group ref counter so the event will staty open @@ -326,9 +337,13 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (eventBase->type == ncclProfileColl) { struct collective* parent = (struct collective *)eDescr->parentObj; - struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId]; + int channelId = eDescr->proxyOp.channelId; + struct proxyOp* event = (eDescr->proxyOp.isSend) ? + &parent->send[channelId][parent->nProxyOps[channelId]++] : + &parent->recv[channelId][parent->nProxyOps[channelId]++]; + event->type = ncclProfileProxyOp; - event->channelId = eDescr->proxyOp.channelId; + event->channelId = channelId; event->pid = eDescr->proxyOp.pid; event->rank = eDescr->rank; event->peer = eDescr->proxyOp.peer; @@ -338,13 +353,14 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; - __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } else { // ncclProfileP2p struct p2p* parent = (struct p2p *)eDescr->parentObj; - struct proxyOp* event = &parent->op; + int channelId = eDescr->proxyOp.channelId; + struct proxyOp* event = &parent->op[channelId]; event->type = ncclProfileProxyOp; - event->channelId = eDescr->proxyOp.channelId; + event->channelId = channelId; event->pid = eDescr->proxyOp.pid; event->rank = eDescr->rank; event->peer = eDescr->proxyOp.peer; @@ -354,7 +370,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; - __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } } else if (eDescr->type == ncclProfileProxyStep) { @@ -379,7 +395,7 @@ void updateEvent(void* handle) { uint8_t type = *(uint8_t *)handle; if (type == ncclProfileGroup) { struct group* event = (struct group *)handle; - if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { event->stopTs = gettime() - startTime; // return group event to the pool __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED); @@ -387,7 +403,7 @@ void updateEvent(void* handle) { debugEvent(event, "GroupStop"); } else if (type == ncclProfileColl) { struct collective* event = (struct collective *)handle; - if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) { event->base.stopTs = gettime() - startTime; debugEvent(event, "CollStop"); updateEvent(event->base.parent); @@ -396,7 +412,7 @@ void updateEvent(void* handle) { debugEvent(event, "CollStop"); } else if (type == ncclProfileP2p) { struct p2p* event = (struct p2p *)handle; - if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) 
== 0) { event->base.stopTs = gettime() - startTime; debugEvent(event, "P2pStop"); updateEvent(event->base.parent); @@ -408,7 +424,7 @@ void updateEvent(void* handle) { event->stopTs = gettime() - startTime; if (event->pid != pid) { // only for proxyOps that don't have a parent collective/p2p (i.e., PXN) - int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1; + int done = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED); if (done == detachPoolSize) { // reset the event completed (done) counter __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED); @@ -451,12 +467,20 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) { struct collective* event = (struct collective *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; + } else if (type == ncclProfileP2p) { + // stopping the p2p event in NCCL core does not + // mean the p2p has completed. It means the p2p + // was submitted/enqueued so we need to keep the event open + struct p2p* event = (struct p2p *)eHandle; + event->base.stopTs = gettime() - startTime; + return ncclSuccess; } + updateEvent(eHandle); return ncclSuccess; } -__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) { +__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; @@ -482,7 +506,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_v1_t ncclProfiler_v1 = { +ncclProfiler_t ncclProfiler_v2 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index 490ba7ce4..f26a9eeb2 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -11,56 +11,6 @@ #define __hidden __attribute__ ((visibility("hidden"))) -__hidden const char* ncclFuncToString(int func) { - switch(func) { - case 0: - return "ncclBroadcast"; - case 1: - return "ncclReduce"; - case 2: - return "ncclAllGather"; - case 3: - return "ncclReduceScatter"; - case 4: - return "ncclAllReduce"; - case 5: - return "ncclSendRecv"; - case 6: - return "ncclSend"; - case 7: - return "ncclRecv"; - } - return NULL; -} - -__hidden const char* ncclAlgoToString(int algo) { - switch(algo) { - case 0: - return "Tree"; - case 1: - return "Ring"; - case 2: - return "CollnetDirect"; - case 3: - return "CollnetChain"; - case 4: - return "Nvls"; - case 5: - return "NvlsTree"; - } -} - -__hidden const char* ncclProtoToString(int proto) { - switch(proto) { - case 0: - return "LL"; - case 1: - return "LL128"; - case 2: - return "Simple"; - } -} - // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category // It appears that nesting more than three events causes issues. 
Therefore, every event is given an increasing id and a // category that matches the type of event (GROUP, COLL, P2P, PROXY, NET) @@ -77,24 +27,24 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", - ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs); + event->base.func, collId++, getpid(), 1, event->base.stopTs); } static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n", - ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n", + event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs); + event->base.func, p2pId++, getpid(), 1, event->base.stopTs); } static __thread int proxyOpId; @@ -250,14 +200,18 @@ void printEvent(FILE* fh, void* handle) { struct collective* c = (struct collective *)handle; printCollEventHeader(fh, c); for (int i = 0; i < MAX_CHANNELS; i++) { - printEvent(fh, &c->send[i]); - printEvent(fh, &c->recv[i]); + for (int j = 0; j < c->nProxyOps[i]; j++) { + printEvent(fh, &c->send[i][j]); + printEvent(fh, &c->recv[i][j]); + } } printCollEventTrailer(fh, c); } else if (type == ncclProfileP2p) { struct p2p* p = (struct p2p *)handle; printP2pEventHeader(fh, p); - printEvent(fh, &p->op); + for (int i = 0; i < MAX_CHANNELS; i++) { + printEvent(fh, &p->op[i]); + } printP2pEventTrailer(fh, p); } else if (type == 
ncclProfileProxyOp) { struct proxyOp* p = (struct proxyOp *)handle; diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index aafabd72d..77b543d12 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -67,6 +67,7 @@ typedef struct { // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer // // Outputs: // - nChannels: number of channels (hence SMs) to be used. @@ -82,15 +83,15 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); + int regBuff, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; +} ncclTuner_v4_t; -typedef ncclTuner_v3_t ncclTuner_t; +typedef ncclTuner_v4_t ncclTuner_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index c3cf00dfd..7925dcfa1 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -12,10 +12,11 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels) { + int regBuff, int* nChannels) { // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; } *nChannels = 1; return ncclSuccess; @@ -25,7 +26,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } #define PLUGIN_NAME "Example" -const ncclTuner_v3_t ncclTunerPlugin_v3 = { +const ncclTuner_v4_t ncclTunerPlugin_v4 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, diff --git a/makefiles/common.mk b/makefiles/common.mk index 59e4151ce..82164ab5c 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -12,6 +12,7 @@ DEBUG ?= 0 ASAN ?= 0 UBSAN ?= 0 TRACE ?= 0 +WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 @@ -115,6 +116,10 @@ ifeq ($(NVTX), 0) CXXFLAGS += -DNVTX_DISABLE endif +ifneq ($(WERROR), 0) +CXXFLAGS += -Werror +endif + ifneq ($(KEEP), 0) NVCUFLAGS += -keep endif diff --git a/makefiles/version.mk b/makefiles/version.mk index bcc0ff3ce..252300934 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 23 -NCCL_PATCH := 4 +NCCL_MINOR := 24 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b254eac32..2c5d9e863 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,17 +7,22 @@ include ../makefiles/common.mk include ../makefiles/version.mk ##### src files -INCEXPORTS := nccl.h nccl_net.h +INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc 
group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ - $(wildcard transport/*.cc) + $(wildcard transport/*.cc) \ + $(wildcard register/*.cc) \ + $(filter-out ras/client.cc,$(wildcard ras/*.cc)) +BINSRCFILES := ras/client.cc ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a +##### binaries +BINNAME := ncclras ##### pkgconfig files PKGCONFIGFILE := nccl.pc ##### dirs @@ -26,11 +31,12 @@ INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig +BINDIR := $(BUILDDIR)/bin ##### target files CUDARTLIB ?= cudart_static +# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 ifeq ($(CUDARTLIB), cudart_static) - # Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 LIBSRCFILES += enhcompat.cc endif @@ -40,18 +46,21 @@ LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) -DEPFILES := $(LIBOBJ:%.o=%.d) +BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) +DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVMANIFEST := $(BUILDDIR)/obj/device/manifest ##### rules -build : lib staticlib +build : lib staticlib binary lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) +binary : $(BINDIR)/$(BINNAME) + $(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS) $(MAKE) -C ./device @@ -85,6 +94,11 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST) mkdir -p $(LIBDIR) ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) +$(BINDIR)/$(BINNAME): $(BINOBJ) + @printf "Linking %-35s > %s\n" $(BINNAME) $@ + mkdir -p $(BINDIR) + $(CXX) $(CXXFLAGS) $^ -o $@ + $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @printf "Generating %-35s > %s\n" $< $@ @@ -121,15 +135,17 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS) clean : $(MAKE) -C device clean - rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} + rm -rf ${BINDIR} ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : build mkdir -p $(PREFIX)/lib mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include + mkdir -p $(PREFIX)/bin cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ + cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/ FILESTOFORMAT := $(shell find . 
-name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, diff --git a/src/bootstrap.cc b/src/bootstrap.cc index c1d085e4c..d11e59953 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -13,6 +13,7 @@ #include #include "proxy.h" #include "param.h" +#include "ras.h" #define BOOTSTRAP_N_CHECK_ABORT 10000 #define BOOTSTRAP_TAG_CONNECT (0x1 << 31) @@ -110,13 +111,13 @@ ncclResult_t bootstrapNetInit() { if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); pthread_mutex_unlock(&bootstrapNetLock); - return ncclInternalError; + return ncclInvalidUsage; } } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; - sprintf(line, " %s:", bootstrapNetIfName); + snprintf(line, sizeof(line), " %s:", bootstrapNetIfName); ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); - INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line); + INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line); bootstrapNetInitDone = 1; } pthread_mutex_unlock(&bootstrapNetLock); @@ -152,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -166,7 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -302,7 +304,7 @@ static void* bootstrapRoot(void* rargs) { // if the number of root > 1, we will receive one extra info from the first local_id of the next root n2send = nRankFromRoot(iroot, nranks, nroots); nrecv = n2send + ((nroots > 1) ? 1 : 0); - NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv * sizeof(union ringConnectInfo)), res, out); + NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out); NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out); } @@ -492,29 +494,37 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { struct netIf userIfs[MAX_OOB_DEVS]; int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS); // loop over the device and return the first one matching - int devId = 0; int nDev = 0; NCCLCHECK(comm->ncclNet->devices(&nDev)); + int devId = 0; while (devId < nDev) { ncclNetProperties_t props; comm->ncclNet->getProperties(devId, &props); // check against user specified HCAs/ports - bool found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot; - if (found) { + if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) { + // All plain physical devices have been initialized at this point devOOB = devId; break; } devId++; } if (devOOB == -1) { - WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv); - goto noEnv; + if (!searchNot) + WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); + else + WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? 
"exactly " : "", userIfEnv); + pthread_mutex_unlock(&bootstrapNetLock); + return ncclInvalidArgument; } } else { - noEnv: // default choice is device 0 devOOB = 0; } + // display info on the chosen device + ncclNetProperties_t props; + ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props); + bool hasProp = res == ncclSuccess; + INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1); } pthread_mutex_unlock(&bootstrapNetLock); } @@ -545,7 +555,8 @@ static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket } static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state, union ncclSocketAddress* peerAddresss, - union ncclSocketAddress* peerProxy, uint64_t* peerUDS) { + union ncclSocketAddress* peerProxy, uint64_t* peerUDS, + struct rasRankInit* rasRanks) { ncclResult_t res = ncclSuccess; int rank = comm->rank; int nRanks = comm->nRanks; @@ -553,6 +564,7 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st union ncclSocketAddress peerAddress; union ncclSocketAddress peerProxy; uint64_t peerUDS; + struct rasRankInit rasRank; }* ringData = NULL; NCCLCHECK(ncclCalloc(&ringData, nRanks)); @@ -563,6 +575,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress)); if (peerUDS) memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t)); + if (rasRanks) + memcpy(&(ringData[rank].rasRank), rasRanks + rank, sizeof(*rasRanks)); // allgather NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit); @@ -575,6 +589,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress)); if (peerUDS) memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t)); + if (rasRanks) + memcpy(rasRanks + irank, &(ringData[irank].rasRank), sizeof(*rasRanks)); } exit: @@ -598,7 +614,10 @@ static ncclResult_t sendToRoot(struct ncclBootstrapHandle* handle, struct ncclCo NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000); NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256); +NCCL_PARAM(RasEnable, "RAS_ENABLE", 1); + ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE]; @@ -607,6 +626,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { struct ncclSocket sock, listenSockRoot; struct extInfo info = {0}; union ringConnectInfo nextPeer; + bool performRasAddRanks = true; + struct rasRankInit* rasRanks = nullptr; uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0}; @@ -696,23 +717,45 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { // in case of failure, those resources will be free'd when calling bootstrapDestroy, so we can return immediatly NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); NCCLCHECK(ncclCalloc(&proxySocket, 1)); - NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy)); + NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail); - NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks)); - 
NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank)); + NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail); + NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail); // create a socket for others to reach out (P2P) union ncclSocketAddress peerSocketAddress; - NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); - NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress))); + NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail); + NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); + // Initialize RAS + if (ncclParamRasEnable() == 1) { + // The RAS thread will take care of freeing the memory allocated below. + NCCLCHECK(ncclCalloc(&rasRanks, nranks)); + memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr)); + rasRanks[rank].pid = getpid(); + rasRanks[rank].cudaDev = comm->cudaDev; + rasRanks[rank].nvmlDev = comm->nvmlDev; + if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + // We should still participate in the ringAllInfo below as the peers will be waiting for us. + // Just make sure that the address is clearly invalid... + memset(rasRanks+rank, '\0', sizeof(*rasRanks)); + performRasAddRanks = false; + } + } + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]); - NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS)); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, rasRanks), result, fail); BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]); // Create the service proxy and get the UDS - NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS)); + NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), result, fail); + + if (ncclParamRasEnable() == 1 && performRasAddRanks) { + if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess) + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + } BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]); TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); @@ -722,8 +765,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9, timers[BOOTSTRAP_INIT_TIME_RING] / 1e9, timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9); - - return ncclSuccess; +exit: + return result; +fail: + free(proxySocket); + goto exit; } ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { @@ -761,6 +807,11 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo union ncclSocketAddress peerSocketAddress; NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); + if (ncclParamRasEnable() == 1) { + if (ncclRasCommInit(comm, nullptr) != ncclSuccess) + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + } + // Get addr from next rank using the parent's connections 
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail); @@ -773,14 +824,14 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag)); } - NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); if (parent->config.splitShare) { /* map local rank to top parent local rank. */ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; } - NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL, NULL), ret, fail); } else { NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail); @@ -788,7 +839,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail); NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail); - NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, NULL), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); } @@ -811,7 +862,7 @@ static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncc struct bootstrapState* state = (struct bootstrapState*)commState; struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag}; - NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail); NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail); return ncclSuccess; diff --git a/src/collectives.cc b/src/collectives.cc index be9468d49..479d4c511 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -44,9 +44,9 @@ const char* ncclDatatypeToString(ncclDataType_t type) { case ncclFloat16: return "ncclFloat16"; case ncclFloat32: return "ncclFloat32"; case ncclFloat64: return "ncclFloat64"; -#if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: return "ncclBfloat16"; -#endif + case ncclFloat8e4m3: return "ncclFloat8e4m3"; + case ncclFloat8e5m2: return "ncclFloat8e5m2"; default: return "Unknown"; } } @@ -87,8 +87,7 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, @@ -111,8 +110,7 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, @@ -133,16 +131,14 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } /* Deprecated original "in place" function, similar to MPI */ NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream)); - return ncclSuccess; + return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); } NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, @@ -166,8 +162,7 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, @@ -189,8 +184,7 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } struct NvtxParamsSendRecv { @@ -212,12 +206,7 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); -exit: - NCCLCHECK(ncclGroupEnd()); - return ret; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, @@ -230,10 +219,5 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); -exit: - NCCLCHECK(ncclGroupEnd()); - return ret; + return ncclEnqueueCheck(&info); } diff --git a/src/debug.cc b/src/debug.cc index 
d21ea3d12..2ea6eabde 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,6 +8,7 @@ #include "nccl_net.h" #include #include +#include #include #include #include @@ -89,6 +90,8 @@ static void ncclDebugInit() { mask = NCCL_REG; } else if (strcasecmp(subsys, "PROFILE") == 0) { mask = NCCL_PROFILE; + } else if (strcasecmp(subsys, "RAS") == 0) { + mask = NCCL_RAS; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -224,6 +227,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } } +NCCL_API(void, ncclResetDebugInit); +void ncclResetDebugInit() { + // Cleans up from a previous ncclDebugInit() and reruns. + // Use this after changing NCCL_DEBUG and related parameters in the environment. + __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); + if (ncclDebugFile != stdout) { + fclose(ncclDebugFile); + ncclDebugFile = stdout; + } + ncclDebugLevel = -1; + ncclDebugInit(); +} + NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { diff --git a/src/device/all_gather.h b/src/device/all_gather.h index fb56e483b..5d79d7357 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -9,64 +9,88 @@ #include "primitives.h" namespace { - template + template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; - size_t count, partOffset, partCount, chunkCount; + ssize_t count, partOffset, partCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount); - size_t offset; - size_t dataOffset; + ssize_t offset; + ssize_t dataOffset; int nelem; int rankDest; - + int workNthreads; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; - // Coverity reports that the callee treats &ring->next as an array. However, due to the use of - // FanSymmetric<1>, only the first element is ever accessed, so it's fine. - // coverity[callee_ptr_arith:FALSE] - Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); - for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { - /////////////// begin AllGather steps /////////////// - nelem = min(chunkCount, partCount - elemOffset); - dataOffset = partOffset + elemOffset; + // If isNetOffload == true, we only use 1 warp to drive Ring algo/network communication + // and the rest of warps proceed to copy src data into dst buffer in parallel when AG + // is not in-place. + if (isNetOffload) { + workNthreads = WARP_SIZE; + chunkCount = NCCL_MAX_NET_SIZE; + } else { + workNthreads = nthreads; + } - // step 0: push data to next GPU - rankDest = ringRanks[0]; - offset = dataOffset + rankDest * count; + if (tid < workNthreads) { + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] + Primitives, 1, Proto, 0, isNetOffload> prims + (tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work, NULL, isNetOffload ? 
NCCL_MAX_NET_SIZE : 0); + for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { + /////////////// begin AllGather steps /////////////// + nelem = min(chunkCount, partCount - elemOffset); + dataOffset = partOffset + elemOffset; + + // step 0: push data to next GPU + rankDest = ringRanks[0]; + offset = dataOffset + rankDest * count; - if (inputBuf + dataOffset == outputBuf + offset) { // In place - prims.directSend(dataOffset, offset, nelem); - } else { - prims.directCopySend(dataOffset, offset, nelem); - } + if ((inputBuf + dataOffset == outputBuf + offset) || isNetOffload) { // In place or onePPN + prims.directSend(dataOffset, offset, nelem); + } else { + prims.directCopySend(dataOffset, offset, nelem); + } + + // k-2 steps: copy to next GPU + for (int j = 1; j < nranks - 1; ++j) { + rankDest = ringRanks[nranks - j]; + offset = dataOffset + rankDest * count; + prims.directRecvCopyDirectSend(offset, offset, nelem); + } - // k-2 steps: copy to next GPU - for (int j=1; j + (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, partCount); } + // we have to wait for all warps before we can proceed to the next work; + // otherwise, we can have contention if next work will use the outputBuf + // in this work. We use bar 14 to avoid conflicts with prims barrier and + // __syncthread(). + if (isNetOffload) barrier_sync(14, nthreads); } } template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { - using Proto = ProtoSimple; - runRing(tid, nthreads, work); + bool isNetOffload = work->isOneRPN && work->netRegUsed; + if (isNetOffload) + runRing, true>(tid, nthreads, work); + else + runRing, false>(tid, nthreads, work); } }; @@ -96,7 +120,7 @@ struct RunWorkCollsendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg); + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int last = 0; @@ -137,6 +161,7 @@ struct RunWorkCollnHeads * count, nelem, count, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndBcast) { // Bcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; @@ -148,6 +173,7 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 } } else { /* direct allgather */ @@ -204,11 +230,11 @@ struct RunWorkCollchannelLo; char* inbuf = (char*)work->sendbuff; char* outbuf = (char*)work->recvbuff; - ssize_t sizePerRank = work->collnet.count*sizeof(T); - bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank); + ssize_t countPerRank = work->collnet.count*sizeof(T); + bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*countPerRank); - ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -221,15 +247,15 @@ struct RunWorkColl (tid, tn, 
0, nullptr, false, /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { - return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset; + return work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset; }, /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* { return d < outIsDst ? outbuf + userOneBeg - : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg + : work->regUsed && (sendDirectFlag & NCCL_P2P_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg : (char*)dstPtrs[d-outIsDst] + railAllOffset; }, delta); @@ -262,8 +288,9 @@ struct RunWorkCollchannelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t sizePerRank = work->collnet.count*sizeof(T); + ssize_t countPerRank = work->collnet.count; size_t chunkSize = work->collnet.chunkCount; + const int hasDn = (direct->down[0] >= 0) ? 1 : 0; bool isMultiRail = (direct->nHeads > 1); int nWarps1 = 1; int nWarps2 = (isMultiRail ? 2 : 1); @@ -277,9 +304,12 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); + // If this rank has local peers (i.e, hasDn == true), we cannot offload all data to network. + // In this case, steps should be computed based on chunkSize and so on; otherwise, we just + // bump the step by 1 to kick off collnet progress. + int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1; Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); } __syncwarp(); @@ -288,11 +318,11 @@ struct RunWorkColl, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr, /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); - ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; - ssize_t railOneEnd = railOneBeg + sizePerRank; + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.send(beg - railOneBeg, max(ssize_t(0), end - beg)); @@ -304,10 +334,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed && !hasDn) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1); } __syncwarp(); } else { @@ -315,7 +344,7 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff, /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += 
nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; @@ -333,7 +362,7 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff, /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { + for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 36b8d3206..c6c131517 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -69,7 +69,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } // Make final copy from buffer to dest. @@ -139,7 +139,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } } } @@ -222,7 +222,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } } } @@ -268,22 +268,30 @@ struct RunWorkColl; if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter - Primitives, /*Direct=*/0, Proto, 0> + Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, - work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); + ssize_t offsetBase, peerOffset; + ssize_t maxNelems; + if (work->netRegUsed) { + offsetBase = bid * chunkSize; + maxNelems = size; // never be the min + peerOffset = nChannels * chunkSize; + } else { + offsetBase = bid * direct->nHeads * chunkSize; + maxNelems = direct->nHeads * chunkSize; + peerOffset = chunkSize; + } + // For collnet UB case, we need to organize buffers differently for contiguous buffer access + // across channels. This access pattern should be consistent with code in coll_net.cc for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; - int nelem = min(direct->nHeads*chunkSize, size-offset); - if (work->regUsed) { - prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); - } else { - prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); - } + ssize_t offset = gridOffset + offsetBase; + ssize_t nelem = min(maxNelems, size - offset); + prims.scatter(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift); } // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually // a false positive. 
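The hunk above switches the CollNetDirect scatter/gather to a per-head-contiguous layout whenever the user buffer is registered with the network (netRegUsed), as the added comment about "contiguous buffer access across channels" states. As an illustration only, the helper below is hypothetical and not part of NCCL, but its arithmetic mirrors the offsetBase/peerOffset values chosen in the diff: it computes where the slice consumed by head h on channel bid lands under each layout.

// Illustrative sketch of the two offset layouts used by the scatter above.
// Names (regOffset/defOffset) are hypothetical; the formulas follow the
// offsetBase/peerOffset assignments in the hunk.
#include <cstddef>

// Registered-buffer layout (work->netRegUsed): each head owns a contiguous
// run of nChannels chunks, so the network sees one contiguous region per head.
static size_t regOffset(size_t gridOffset, int bid, int h,
                        int nChannels, size_t chunkSize) {
  return gridOffset + bid * chunkSize          // offsetBase
                    + h * (nChannels * chunkSize); // peerOffset per head
}

// Default layout (no registration): heads interleave inside each channel's
// nHeads*chunkSize block.
static size_t defOffset(size_t gridOffset, int bid, int h,
                        int nHeads, size_t chunkSize) {
  return gridOffset + bid * (nHeads * chunkSize) // offsetBase
                    + h * chunkSize;             // peerOffset per head
}

With the registered layout the nChannels chunks belonging to one head are adjacent in the user buffer, which is what lets the collnet path hand the plugin a single contiguous range per head instead of strided pieces.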
@@ -291,24 +299,20 @@ struct RunWorkColl= tidStartReduce && direct->out != -1) { if (hasDn) { // Reduce, send to network - Primitives, /*Direct=*/0, Proto, 0> + Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, - work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - if (work->regUsed) { - prims.directRecvReduceSend(offset, nelem); - } else { - prims.recvReduceSend(offset, nelem); - } + ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize + : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.recvReduceDirectSend(offset, offset, nelem); } } else { // Directly send to network - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == tidStartReduce) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); + Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1); } __syncwarp(); } else { @@ -316,8 +320,8 @@ struct RunWorkCollout, work->sendbuff, work->recvbuff, work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); + ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); prims.send(offset, nelem); } } @@ -327,10 +331,21 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); + ssize_t offsetBase, peerOffset; + ssize_t maxNelems; + if (work->netRegUsed) { + offsetBase = bid * chunkSize; + maxNelems = size; // never be the min + peerOffset = nChannels * chunkSize; + } else { + offsetBase = bid * direct->nHeads * chunkSize; + maxNelems = direct->nHeads * chunkSize; + peerOffset = chunkSize; + } for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; - int nelem = min(direct->nHeads*chunkSize, size-offset); - prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); + ssize_t offset = gridOffset + offsetBase; + ssize_t nelem = min(maxNelems, size - offset); + prims.directGather(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { if (hasDn) { @@ -342,15 +357,15 @@ struct RunWorkCollout, direct->down, work->sendbuff, work->recvbuff, work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); + ssize_t offset = work->netRegUsed ? 
gridOffset + (bid + direct->headRank * nChannels) * chunkSize + : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); } } else { - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == tidStartBcast) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1); } __syncwarp(); } else { @@ -394,8 +409,6 @@ struct RunWorkCollnHeads * chunkSize; - ssize_t offset; - int nelem; int remCount = channelCount%(nvls->nHeads*chunkSize); int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T)); @@ -407,8 +420,8 @@ struct RunWorkCollredOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; - offset = gridOffset + elemOffset; - nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + ssize_t offset = gridOffset + elemOffset; + int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { @@ -419,8 +432,8 @@ struct RunWorkCollredOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; - offset = gridOffset + elemOffset; - nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + ssize_t offset = gridOffset + elemOffset; + int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce) { @@ -430,7 +443,8 @@ struct RunWorkColldown, &nvls->down, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - ssize_t chunkOffset; + ssize_t chunkOffset, offset; + int nelem; if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; chunkOffset = elemOffset + nvls->headRank * chunkSize; offset = gridOffset + chunkOffset; @@ -456,6 +470,7 @@ struct RunWorkCollregUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; @@ -464,38 +479,23 @@ struct RunWorkCollredOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { - if (!hasOut) { - // Reduce, broadcast through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; - // Coverity complains about a possible overrun inside the class below, but that's actually - // a false positive. 
- // coverity[identity_transfer:FALSE] - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; - int nelem = min(chunkSize, size - offset); - prims.directRecvDirectSend(offset, offset, nelem); - } - } else { - // Reduce, send to network - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - // Coverity complains about a possible overrun inside the class below, but that's actually - // a false positive. - // coverity[identity_transfer:FALSE] - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, - work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; - int nelem = min(chunkSize, size - offset); - prims.directRecvDirectSend(offset, offset, nelem); - } + // Reduce, send to network + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. + // coverity[identity_transfer:FALSE] + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, work->recvbuff, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize + : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast @@ -504,10 +504,11 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> - prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, + prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, work->recvbuff, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + ssize_t offset = work->regUsed && work->netRegUsed ? 
gridOffset + (nvls->headRank * nChannels + bid) * chunkSize + : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.directRecvDirectSend(offset, offset, nelem); } @@ -660,10 +661,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (groupTid == 0) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps); + Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, 1); } __syncwarp(); } else { @@ -673,8 +673,10 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 prims.directSend(offset, offset, nelem); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { Primitives, /*Direct=*/1, Proto, 0> @@ -683,18 +685,19 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 prims.directRecvReduceDirectSend(offset, offset, nelem); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { if (recv == nranks) { // I'm the first in the broadcast chain, I need to perform the division (postOp) if (send == -1) { - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (groupTid == 0) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps); + Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, 1); } __syncwarp(); } else { @@ -720,7 +723,7 @@ struct RunWorkColluserRanks[0]; const int nextRank = ring->userRanks[1]; const int root = work->root; - size_t chunkCount; - size_t channelCount; - size_t gridOffset; - ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); + ssize_t chunkCount; + ssize_t channelCount; + ssize_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; + int workNthreads; + bool isNetOffload = work->isOneRPN && work->netRegUsed; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; - // Coverity reports that the callee treats &ring->next as an array. However, due to the use of - // FanSymmetric<1>, only the first element is ever accessed, so it's fine. - // coverity[callee_ptr_arith:FALSE] - Primitives, 1, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); + workNthreads = isNetOffload ? WARP_SIZE : nthreads; - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); + if (tid < workNthreads) { + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
+ // coverity[callee_ptr_arith:FALSE] + Primitives, 1, Proto, 0> + prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); - if (rank == root) { - if (inputBuf == outputBuf) { - prims.directSend(offset, offset, nelem); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + + if (rank == root) { + if (inputBuf == outputBuf || isNetOffload) { + prims.directSend(offset, offset, nelem); + } else { + prims.directCopySend(offset, offset, nelem); + } + } else if (nextRank == root) { + prims.directRecv(offset, offset, nelem); } else { - prims.directCopySend(offset, offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } - } else if (nextRank == root) { - prims.directRecv(offset, offset, nelem); - } else { - prims.directRecvCopyDirectSend(offset, nelem); } + } else if (inputBuf != outputBuf && rank == root) { + inputBuf = inputBuf + gridOffset; + outputBuf = outputBuf + gridOffset; + reduceCopy + (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount); } + if (isNetOffload) barrier_sync(14, nthreads); } } diff --git a/src/device/common.h b/src/device/common.h index 967421b7d..05465ff5a 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -396,6 +396,9 @@ __device__ void ncclDevFunc_Nop(); ncclKernelMain, algo, proto>>(&args4K.args); \ } +#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \ + __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {} + #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ __device__ void ncclDevFunc_##suffix() { \ RunWorkBatch, algo, proto>().run(); \ diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h index f932f51f0..00bb1e333 100644 --- a/src/device/common_kernel.h +++ b/src/device/common_kernel.h @@ -65,19 +65,23 @@ __device__ __forceinline__ void reduceCopyPacks( uintptr_t minSrcs[MinSrcs + !MinSrcs]; uintptr_t minDsts[MinDsts + !MinDsts]; #pragma unroll - for (int s=0; s < MinSrcs; s++) + for (int s=0; s < MinSrcs; s++) { minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; + } + #pragma unroll - for (int d=0; d < MinDsts; d++) + for (int d=0; d < MinDsts; d++) { // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; + } // We dictate loop termination condition according to whether partial hunks // can be handled or not. while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) { BytePack acc[Unroll]; + // minSrcs[0] cannot be nullptr so we always process it { RedFn preFn(0 < PreOpSrcs ? 
preOpArgs[0] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { @@ -163,7 +167,8 @@ __device__ __forceinline__ void reduceCopyPacks( } } for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) { - uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; + uintptr_t dstPtr = cvta_to_global(dstPtrFn(d)); + uintptr_t dst = dstPtr + threadBytesBehind; #pragma unroll Unroll for (int u=0; u < Unroll; u++) { st_global(dst, acc[u]); @@ -173,11 +178,15 @@ __device__ __forceinline__ void reduceCopyPacks( nWarps = nThreads/WARP_SIZE; #pragma unroll - for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk; + for (int s=0; s < MinSrcs; s++) { + minSrcs[s] += (nWarps-1)*BytePerHunk; + } #pragma unroll // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] - for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk; + for (int d=0; d < MinDsts; d++) { + minDsts[d] += (nWarps-1)*BytePerHunk; + } threadBytesBehind += nWarps*BytePerHunk; threadBytesAhead -= nWarps*BytePerHunk; nHunksAhead -= nWarps; diff --git a/src/device/generate.py b/src/device/generate.py index a0d225946..b69a2d7cc 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -5,7 +5,7 @@ # Order of redops, tys, protos, algos must match src/include/device.h all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] -all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"] +all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"] all_protos = ["LL","LL128","SIMPLE"] all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"] @@ -107,6 +107,9 @@ def required_cuda(coll, redop, ty, algo, proto): if coll in ("AllReduce","Reduce","ReduceScatter"): if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None if ty=="bf16": cudart = max(cudart, 11000) + if ty.startswith("f8"): + cudart = max(cudart, 11080) + arch = max(arch, 900) if "NVLS" in algo: if coll in ("AllReduce","Reduce","ReduceScatter"): @@ -125,7 +128,7 @@ def required_cuda(coll, redop, ty, algo, proto): def equivalent_primary(coll, redop, ty, algo, proto): if coll in ("AllReduce", "Reduce", "ReduceScatter"): # map signed integer sum/prod to unsigned - if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i": + if redop in ("Sum","Prod","PreMulSum","SumPostDiv") and ty[0]=="i": return (coll, redop, "u"+ty[1:], algo, proto) # map signed integer min/max to unsigned for non-NVLS if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo): @@ -365,7 +368,9 @@ def partition_by_name(fns): "f16": "half", "f32": "float", "f64": "double", - "bf16": "__nv_bfloat16" + "bf16": "__nv_bfloat16", + "f8e4m3": "__nv_fp8_e4m3", + "f8e5m2": "__nv_fp8_e5m2" } # Generate each /.cu: @@ -385,15 +390,23 @@ def partition_by_name(fns): sym = paste("_", coll, redop, ty, algo, proto) fn_id = primary_to_index[kfn] cudart, arch = required_cuda(*kfn) + s = "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" if (cudart, arch) != (0, 0): - out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) - out( - "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" - .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], - algo=(algo or "RING"), proto=(proto or "SIMPLE"), 
fn_id=fn_id) - ) - if (cudart, arch) != (0, 0): - out("#endif\n") + # Add conditional compilation logic around s. If CUDART_VERSION is satisfactory + # we must compile a kernel regardless of __CUDA_ARCH__ since the host code has + # to link against some stub. + s = "#if CUDART_VERSION >= {cudart}\n" \ + " #if __CUDA_ARCH__ < {arch}\n" \ + " DEFINE_ncclDevKernel_nop({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" \ + " #else\n" \ + " " + s + \ + " #endif\n" \ + "#endif\n" + out(s.format( + cudart=cudart, arch=arch, sym=sym, coll=coll, + redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], + algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id + )) for fn in fns: (coll, redop, ty, algo, proto) = fn diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index e76099821..941b4328d 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -33,17 +33,21 @@ inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) { // Map internal association of handle with group and peer index (called once at init time) inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head; } inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) { + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.head[index]++; } inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + // coverity[index_parm:FALSE] handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index]; } diff --git a/src/device/onerank.cu b/src/device/onerank.cu index 5ff4a85b1..c187dcc44 100644 --- a/src/device/onerank.cu +++ b/src/device/onerank.cu @@ -62,6 +62,10 @@ ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct case ncclUint32: kernel = (void const*)&oneRankReduce>; break; case ncclInt64: kernel = (void const*)&oneRankReduce>; break; case ncclUint64: kernel = (void const*)&oneRankReduce>; break; + #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900 + case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce>; break; + case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce>; break; + #endif case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; diff --git a/src/device/primitives.h b/src/device/primitives.h index 1913640e8..73c10c264 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -103,7 +103,7 @@ struct FanSymmetric { }; // The primitives class. Specialized per protocol in the other headers. -template +template class Primitives; // Used by LL & LL128 to implement direct members in the naive way. 
@@ -121,9 +121,12 @@ struct PrimitivesWithoutDirect { __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } - __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + __device__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } + __device__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + return; + } __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 1a1307f5c..3e00f3b85 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -4,9 +4,9 @@ * See LICENSE.txt for license information ************************************************************************/ -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 2cb10cc49..617b7acf3 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -8,9 +8,9 @@ #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 945878b76..005101940 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -14,9 +14,9 @@ enum primsMode { }; template + int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload> class Primitives< - T, RedOp, Fan, Direct, ProtoSimple, P2p + T, RedOp, Fan, Direct, ProtoSimple, P2p, isNetOffload > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -34,11 +34,7 @@ class Primitives< PatMode = 0x800, NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, - AnyNetDeviceUnpack = 0x4000, - NvlsDirectRead = 0x8000, - NvlsDirectWrite = 0x10000, - IpcWrite = 0x20000, - IpcRead = 0x40000; + AnyNetDeviceUnpack = 0x4000; const int tid, tidInBlock; const int nthreads; int nworkers; @@ -119,12 +115,9 @@ class Primitives< template __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; - const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input - const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write // Yes, for some template arguments this code will be unreachable. That's fine. 
// coverity[dead_error_line] - if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || - ((flags & (Send*RoleWaitSend)) && !noSendWait)) { + if ((flags & (Recv * RoleWaitRecv)) || (flags & (Send * RoleWaitSend))) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); @@ -134,27 +127,38 @@ class Primitives< } if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { - if (flags & ConnFifoEnabled) + if ((flags & ConnFifoEnabled) && (flags & (Send * RoleWaitSend))) connFifo[step%NCCL_STEPS].size = nelts*sizeof(T); void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); if (flags & NetRegMode) { - // Do nothing + if (P2p) { + ptrs[index] = NULL; + } else { + if (isSendNotRecv) { + if (!Recv) + ptrs[index] = NULL; + else + ptrs[index] = (T*)ncclShmem.groups[group].userOutput + dstIx + offset; + } else { + ptrs[index] = (T*)ncclShmem.groups[group].userOutput + srcIx + offset; + } + } } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T); } else if (isSendNotRecv && DirectSend) { - if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) { + if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; - } else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send + } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else if (!isSendNotRecv && DirectRecv) { - if (flags & (DirectRead | NvlsDirectRead | IpcRead)) { + if (flags & DirectRead) { ptrs[index] = directBuff + srcIx + offset; - } else if ((flags & DirectWrite) || (flags & IpcWrite)) { + } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; @@ -198,7 +202,7 @@ class Primitives< int slice = 0; int offset = 0; - if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) { + if (tid < nworkers && offset < nelem && !isNetOffload) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. @@ -252,7 +256,7 @@ class Primitives< * so we need to check whether MultimemSrcs and MultimemDsts are 0. */ && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy - if (Send) { + if (Send && Dst && ncclShmem.groups[group].srcs[0] != ncclShmem.groups[group].dsts[1]) { reduceCopy (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1, ncclShmem.groups[group].srcs, @@ -269,16 +273,32 @@ class Primitives< } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) { constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? 
(1+NCCL_MAX_DIRECT_ARITY) : 1; - reduceCopy - (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, - Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, - Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, - workSize); + if (Send && Dst && ncclShmem.groups[group].dsts[1] == nullptr) { + // this case should only be directCopySend() with registered buffers and send to net peer + reduceCopy + (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, + Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs, + 1, ncclShmem.groups[group].dsts, + workSize); + } else { + reduceCopy + (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, + Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs, + Send * fan.nsend() + Dst, ncclShmem.groups[group].dsts, + workSize); + } + } else { + // we will come here when calling prims.directSend with net peer, + // in this case, ncclShmem.groups[group].dsts[0] == NULL, so we + // skip data flush. + workSize = 0; } barrier(); // This barrier has a counterpart in following loop - postPeer(0 < sliceSize); + postPeer(0 < workSize); offset += sliceSize; slice += 1; // Yes, for some template arguments this code will be unreachable. That's fine. @@ -295,10 +315,11 @@ class Primitives< sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; { // Only workers could have Wait roles so we know the slice must be empty // since we've exited the loop above. - waitPeer(0, 0, 0, 0); + waitPeer(0, 0, 0, sliceSize); } barrier(); // Has couterpart in preceding worker-only loop. - postPeer(0 < sliceSize); + int workSize = ncclShmem.aborted ? 0 : sliceSize; + postPeer(0 < workSize); offset += sliceSize; slice += 1; } @@ -347,17 +368,17 @@ class Primitives< ptrs[index] = connEltsFifo + offset/sizeof(T); } else if (Direct && fn.work->regUsed) { if (isSendNotRecv) { - if (flags & (DirectWrite | IpcWrite)) { + if (flags & DirectWrite) { ptrs[index] = directBuff; - } else if (flags & (DirectRead | IpcRead)) { // empty send + } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else { - if (flags & (DirectRead | IpcRead)) { + if (flags & DirectRead) { ptrs[index] = directBuff; - } else if (flags & (DirectWrite | IpcWrite)) { + } else if (flags & DirectWrite) { if (Send) ptrs[index] = directBuff; // send to next from my output buffer else @@ -440,7 +461,7 @@ class Primitives< int i = (j+shift)%fan.nsend(); ssize_t pOffset = i*peerOffset; // Skip the data I am responsible of reducing myself - if (skip >= 0 && i >= skip) pOffset += peerElem; + if (skip >= 0 && i >= skip) pOffset += peerOffset; void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { @@ -452,7 +473,7 @@ class Primitives< } else if (Recv) { if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset; ssize_t pOffset = index*peerOffset; - if (skip >= 0 && index >= skip) pOffset += peerElem; + if (skip >= 0 && index >= skip) pOffset += peerOffset; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx+pOffset, outIx+pOffset, offset, realSize); subBarrier(); @@ -460,7 +481,7 @@ class Primitives< for (int j=0; j= 0 && i >= skip) pOffset += peerElem; + if (skip >= 0 && i >= skip) pOffset += peerOffset; void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; ssize_t 
realPeerSize = min(realSize, totalElem-pOffset); if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; @@ -474,7 +495,7 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) { conn = &peer->recv[connIndex]; if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { // handle must be a device ptr @@ -499,33 +520,34 @@ class Primitives< if (conn->connFifo != nullptr) { flags |= ConnFifoEnabled; connFifo = conn->connFifo; - } else if (Direct && regFlag) { - // User buffers have been registered - if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { - if (P2p) { - flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; - } else if (connIndex == 1 && direct) { - flags |= IpcRead; - } else { - flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; + } + if (Direct) { + if (ipcRegFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + if (P2p) { + flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; + } else { + flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= DirectRead; } - } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { - if (P2p) { - flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead; - } else if (connIndex == 1 && direct) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite; + } + if (netRegFlag) { + if (conn->flags & NCCL_DIRECT_NIC) { + flags |= NetRegMode; + connFifo[step % NCCL_STEPS].size = 0; } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { - /* NVLS direct */ - flags |= NvlsDirectRead; } } } } - __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) { conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); @@ -544,27 +566,26 @@ class Primitives< connStepCache = loadStepValue(connStepPtr); connStepSize = conn->stepSize/sizeof(T); connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - if (connFifo == nullptr && Direct && regFlag) { - // User buffers have been registered - if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { - if (P2p) { - flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; - } else if (connIndex == 1 && direct) { - flags |= IpcRead; - } else { - flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; + if (Direct) { + if (ipcRegFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { + if (P2p) { + flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= DirectWrite; } - } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { - if (P2p) { - flags |= conn->flags & NCCL_DIRECT_WRITE ? 
DirectWrite : DirectRead; - } else if (connIndex == 1 && direct) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite; + } + if (netRegFlag) { + if (conn->flags & NCCL_DIRECT_NIC) { + flags |= NetRegMode; } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { - /* NVLS direct */ - flags |= NvlsDirectWrite; } } } @@ -574,8 +595,8 @@ class Primitives< __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr, - bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr, + struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { @@ -643,11 +664,23 @@ class Primitives< // Coverity thinks that index could be -1 here but that's not actually the case. // coverity[negative_returns:FALSE] - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg); - // coverity[negative_returns:FALSE] - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg); - - if (netReg) flags |= NetRegMode; + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; @@ -659,8 +692,10 @@ class Primitives< } } - // coverity[negative_returns:FALSE] - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? 
e->regUsed : ipcReg), peer); + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { @@ -683,6 +718,16 @@ class Primitives< // Make sure all threads are done writing back conn->step and done using // ncclShmem.groups[group] barrier(); + + if ((flags & DirectRead) && (flags & RoleWaitSend) && P2p) { + // For sendrecv DirectRead, sender needs to wait for receiver reading data from src. + // This has to be done after barrier() since post thread might have contention with + // this check. + int spins = 0; + volatile uint64_t* tail = conn->tail; + volatile uint64_t* head = conn->head; + while (*tail > *head) if (checkAbort(spins)) break; + } } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) { @@ -693,10 +738,10 @@ class Primitives< } if (Direct && ipcReg) { - bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite); - bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite); - bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched) - bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer + bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite); + bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite); + bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead); // sender provides direct buffer (to be fetched) + bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead); // receiver accepts direct buffer if (recvProvider) { int spins = 0; void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; @@ -709,6 +754,7 @@ class Primitives< exchgPtr = (T*)outputBuf; } else { int localPeer = ncclShmem.comm.rankToLocalRank[peer]; + // coverity[deref_parm:FALSE] => work cannot be NULL if ipcReg != NULL exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); } *slot = reinterpret_cast(exchgPtr); @@ -727,6 +773,7 @@ class Primitives< directBuff = reinterpret_cast(ptr); *slot = nullptr; } else { + // coverity[var_deref_op] directBuff = (T*)work->dnOutputs[index]; } } @@ -747,8 +794,10 @@ class Primitives< } else { int localPeer = ncclShmem.comm.rankToLocalRank[peer]; if (MaxRecv == 0) + // coverity[var_deref_op] exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]); else + // coverity[var_deref_op] exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); } @@ -837,11 +886,11 @@ class Primitives< __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); + __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + 
genericOp<1, 1, 1, 1, -1, Output>(inpIx, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { - genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); + __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); @@ -860,6 +909,9 @@ class Primitives< __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } + __device__ __forceinline__ void recvReduceDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); + } __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); } diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index b069c07ec..c2378e3df 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -20,6 +20,12 @@ struct IsFloatingPoint: std::true_type {}; template<> struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; #endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +template<> +struct IsFloatingPoint<__nv_fp8_e4m3>: std::true_type {}; +template<> +struct IsFloatingPoint<__nv_fp8_e5m2>: std::true_type {}; +#endif template<> struct IsFloatingPoint: std::true_type {}; template<> @@ -298,6 +304,24 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f #endif #endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hadd(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hadd2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hmul(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hmul2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(fn.isMinNotMax ? __hmin(__half(x),__half(y)) : __hmax(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(fn.isMinNotMax ? __hmin2(__half2(x),__half2(y)) : __hmax2(__half2(x),__half2(y)))) + + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hadd(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hadd2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hmul(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hmul2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(fn.isMinNotMax ? __hmin(__half(x), __half(y)) : __hmax(__half(x), __half(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(fn.isMinNotMax ? 
__hmin2(__half2(x), __half2(y)) : __hmax2(__half2(x), __half2(y)))) +#endif +#endif + #undef SPECIALIZE_REDUCE //////////////////////////////////////////////////////////////////////////////// @@ -416,9 +440,9 @@ template<> struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 - half2 scalar; + __half2 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { - union { uint64_t u64; half val; }; + union { uint64_t u64; __half val; }; u64 = opArg; scalar.x = val; scalar.y = val; @@ -426,9 +450,9 @@ struct FuncPreMulSum { #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { - union { uint64_t u64; half val; }; + union { uint64_t u64; __half val; }; u64 = opArg; - scalar = __half2float(val); + scalar = (float)val; } #endif }; @@ -459,11 +483,39 @@ struct FuncPreMulSum { }; #endif -template -struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + template<> + struct FuncPreMulSum<__nv_fp8_e4m3> { + using EltType = __nv_fp8_e4m3; + __half2 scalar2; + __device__ FuncPreMulSum(uint64_t opArg) { + union { uint64_t u64; __nv_fp8_storage_t val; }; + u64 = opArg; + scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3)); + scalar2.y = scalar2.x; + } + }; + + template<> + struct FuncPreMulSum<__nv_fp8_e5m2> { + using EltType = __nv_fp8_e5m2; + __half2 scalar2; + __device__ FuncPreMulSum(uint64_t opArg) { + union { uint64_t u64; __nv_fp8_storage_t val; }; + u64 = opArg; + scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2)); + scalar2.y = scalar2.x; + } + }; +#endif +#endif + +template +struct Apply_Reduce, EltPerPack> { + __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. - return Apply_Reduce, 1>::reduce(FuncSum(), a, b); + return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } }; @@ -530,6 +582,51 @@ struct Apply_PreOp, /*EltPerPack=*/1> { #endif #endif +//////////////////////////////////////////////////////////////////////////////// +// Apply_PreOp of FuncPreMulSum for fp8. 
+ +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + template<> + struct Apply_PreOp, /*EltPerPack=*/1> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a + ) { + return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x))); + } + }; + template<> + struct Apply_PreOp, /*EltPerPack=*/2> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a + ) { + return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2))); + } + }; + + template<> + struct Apply_PreOp, /*EltPerPack=*/1> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a + ) { + return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x))); + } + }; + template<> + struct Apply_PreOp, /*EltPerPack=*/2> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a + ) { + return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2))); + } + }; +#endif +#endif + //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv @@ -541,34 +638,44 @@ struct RedOpArg> { } }; -template::value> -struct FuncSumPostDiv_IntOnly; - -template -struct FuncSumPostDiv: FuncSumPostDiv_IntOnly { - __device__ FuncSumPostDiv(uint64_t opArg=0): - FuncSumPostDiv_IntOnly(opArg) { - } -}; - template -struct FuncSumPostDiv_IntOnly: FuncSum { +struct FuncSumPostDiv { + static_assert(T(0) < T(-1), "FuncSumPostDiv is only for implementing ncclAvg on uint types."); using EltType = T; - int divisor; - __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {} -}; - -template -struct FuncSumPostDiv_IntOnly { - static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types."); + using UintType = typename std::conditional::type; + uint32_t divisor:31, isSigned:1; + UintType recip; + + __device__ FuncSumPostDiv(uint64_t opArg=0) { + isSigned = opArg & 1; + divisor = opArg >> 1; + recip = UintType(-1)/divisor; + } + __device__ T divide(T x) { + // x is negative iff we are in signed mode and the top bit is set + bool xneg = isSigned && (x & ~(T(-1)>>1)); + // Compute abs(x): + // T(-x) vs -T(x) is critical. We have to negate then truncate the bits. Consider + // if we are doing signed 8-bit types, thus T=uint8_t. The value -1 is encoded + // as 0xff. -T(0xff) when promoted to 32-bit (which is implicit by compiler) + // gives 0xffffff01, but T(-0xff) is 0x1, and that is the abs value we want. + UintType xabs = xneg ? T(-x) : x; + // Compute quotient by multiplying by reciprical. + UintType q = sizeof(T)==8 ? __umul64hi(xabs, recip) : __umulhi(xabs, recip); + // Quotient may be off by one so do a fixup. + if (xabs - q*divisor >= divisor) q += 1; + // If original x was negative then we have to negate it back since we were + // working with its abs val. + return xneg ? 
-T(q) : T(q); + } }; -template -struct Apply_Reduce, /*EltPerPack=*/1>: - Apply_Reduce, 1> { - __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { +template +struct Apply_Reduce, EltPerPack>: + Apply_Reduce, EltPerPack> { + __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. - return Apply_Reduce, 1>::reduce(FuncSum(), a, b); + return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } }; @@ -576,7 +683,7 @@ template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { - return toPack(fromPack(a) / fn.divisor); + return toPack(fn.divide(fromPack(a))); } }; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index f7b3c25e5..70538b117 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -89,7 +89,7 @@ struct RunWorkCollsendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs); + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs); PatRSAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int last = 0; @@ -137,6 +137,7 @@ struct RunWorkCollnHeads * count, nelem, count, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndReduce) { // Reduce through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; @@ -206,10 +207,10 @@ struct RunWorkCollnHeads; int part = ncclShmem.channelId - work->channelLo; void* inbuf = (void*)work->sendbuff; - ssize_t sizePerRank = work->collnet.count; + ssize_t countPerRank = work->collnet.count; - ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -222,15 +223,15 @@ struct RunWorkCollredOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? (T*)inbuf + userOneBeg - : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) + : work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (T*)srcPtrs[s-1] + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; }, @@ -264,7 +265,8 @@ struct RunWorkCollcollnet.chunkCount); - ssize_t sizePerRank = work->collnet.count; + ssize_t countPerRank = work->collnet.count; + const int hasDn = (direct->down[0] >= 0) ? 
1 : 0; if (direct->out == -1) __trap(); bool isMultiRail = (direct->nHeads > 1); @@ -281,15 +283,15 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr, - work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work); - for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { + work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat, NCCL_DIRECT_READ, 0); + prims.template process(scat, 0, 0); } return; } @@ -297,23 +299,22 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed && !hasDn) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); + Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1); } __syncwarp(); } else { // Phase 2: Reduce from peers + local input -> send to network - Primitives, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, - work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat, 0, NCCL_DIRECT_READ); + prims.template process(scat, 0, 0); } } return; @@ -322,9 +323,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); + int steps = hasDn ? 
(int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1; Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); } __syncwarp(); @@ -333,11 +334,11 @@ struct RunWorkColl, /*Direct=*/0, Proto, 0> prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); - ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; - ssize_t railOneEnd = railOneBeg + sizePerRank; + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true); diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index 9b039a41a..fe3b9ca77 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -15,33 +15,35 @@ struct RunWorkBatch __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->sendBytes; - int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8); + bool useLargeChunk = (work->sendIpcReg && ncclShmem.comm.isAllNvlink) || work->sendNetReg; + int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->sendChunkSize_u32fp8); + int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; Primitives, 1, Proto, 1> prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, - /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directSend(cursor, cursor, n); cursor += n; - } while (cursor < bytes && work->sendRegistered == 0); + } while (cursor < bytes); } template __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->recvBytes; - int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8); + bool useLargeChunk = (work->recvIpcReg && ncclShmem.comm.isAllNvlink) || work->recvNetReg; + int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->recvChunkSize_u32fp8); + int stepSize = useLargeChunk ? 
NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; Primitives, 1, Proto, 1> prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, - /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directRecv(cursor, cursor, n); cursor += n; - } while (cursor < bytes && work->recvRegistered == 0); + } while (cursor < bytes); } __device__ __forceinline__ void run() { diff --git a/src/enqueue.cc b/src/enqueue.cc index 4edb42dec..285e17f69 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -16,6 +16,7 @@ #include // std::memcpy #include // PRIx64 +#include NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); @@ -63,15 +64,6 @@ static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) { default: return 1; } } -static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncReduceScatter ? nRanks*count : count; -} -static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncAllGather ? nRanks*count : count; -} -static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; -} /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ @@ -230,301 +222,8 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { } } -int64_t ncclParamLocalRegister(); NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); -struct ncclIpcCleanupCallback { - struct ncclCommCallback base; - void* ptr; -}; -static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { - struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb; - CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr)); - free(me); - return ncclSuccess; -} - -static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { - if (conn->connected) { - if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) { - *needReg = true; - } else { - // network connection - *needReg = false; - } - } else { - struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer]; - struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank]; - int canConnect = 0; - NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); - if (canConnect) { - *needReg = true; - } else { - *needReg = false; - } - } - return ncclSuccess; -} - -static ncclResult_t registerCollBuffers( - struct ncclComm* comm, struct ncclTaskColl* info, - void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], - void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], - struct ncclIntruQueue* cleanupQueue, - bool* regNeedConnect - ) { - ncclResult_t result = ncclSuccess; - - info->regBufType = NCCL_REGULAR_BUFFER; - *regNeedConnect = true; - if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; -#if CUDART_VERSION >= 11030 - if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { - if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; - bool regBufUsed = false; - const void *sendbuff = info->sendbuff; - void 
*recvbuff = info->recvbuff; - if (info->func == ncclFuncAllGather) sendbuff = NULL; - if (info->func == ncclFuncReduceScatter) recvbuff = NULL; - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - - /* first try local registration. */ - if (ncclParamLocalRegister()) { - ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); - } - - if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) { - ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); - } - - if (regBufUsed) { - *regNeedConnect = false; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ - if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); - } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); - } - info->regBufType = NCCL_NVLS_REG_BUFFER; - } - } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - int sendRegBufFlag = 0; - int recvRegBufFlag = 0; - void *sendHandle, *recvHandle; - - if (ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); - info->sendMhandle = sendHandle; - if (sendRegBufFlag) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); - info->recvMhandle = recvHandle; - } - } - - if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { - if (!sendRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); - info->sendMhandle = sendHandle; - } - if (sendRegBufFlag && !recvRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); - info->recvMhandle = recvHandle; - } - } - - if (sendRegBufFlag && recvRegBufFlag) { - info->nMaxChannels = 1; - info->regBufType = NCCL_COLLNET_REG_BUFFER; - if (sendRegBufFlag == 1 && recvRegBufFlag == 1) { - INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize); - } - } - } else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) { - // IPC buffer registration - if (info->func == ncclFuncReduceScatter) goto exit; - if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && 
info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; - if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; - if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; - - int peerRanks[NCCL_MAX_LOCAL_RANKS]; - int nPeers = 0; - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - int regBufFlag = 0; - memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS); - - if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - struct ncclChannel* channel = comm->channels; - for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { - for (int updown = 0; updown < 2; ++updown) { - int peer; - if (updown == 0) - peer = channel->collnetDirect.up[r]; - else - peer = channel->collnetDirect.down[r]; - if (peer != -1) { - struct ncclConnector* peerConn = &channel->peers[peer]->recv[0]; - bool needReg = false; - - NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg)); - if (needReg) { - bool found = false; - for (int p = 0; p < nPeers; ++p) { - if (peerRanks[p] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - } - - if (nPeers > 0) { - if (ncclParamLocalRegister()) - ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs); - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - if (regBufFlag) { - if (ncclParamLocalRegister()) - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } else if (info->algorithm == NCCL_ALGO_RING) { - struct ncclReg* recvRegRecord; - NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); - if (recvRegRecord == NULL) goto exit; - for (int c = 0; c < comm->nChannels; ++c) { - struct ncclChannel* channel = comm->channels + c; - for (int r = 0; r < 2; ++r) { - bool needReg = false; - int peer; - struct ncclConnector* peerConn; - // P2P transport - if (r == 0) - peer = channel->ring.prev; - else - peer = channel->ring.next; - peerConn = &channel->peers[peer]->recv[0]; - NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg)); - - if (needReg) { - bool found = false; - for (int p = 0; p < nPeers; ++p) { - if (peerRanks[p] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - if (nPeers > 0) { - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, 
&info->recvbuffRmtAddrs); - } - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { - struct ncclReg* recvRegRecord; - NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); - if (recvRegRecord == NULL) goto exit; - for (int c = 0; c < comm->nChannels; ++c) { - struct ncclChannel* channel = comm->channels + c; - struct ncclTree* tree = NULL; - int peers[NCCL_MAX_TREE_ARITY + 1]; - - if (info->algorithm == NCCL_ALGO_TREE) - tree = &channel->tree; - else - tree = &channel->collnetChain; - for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p]; - peers[NCCL_MAX_TREE_ARITY] = tree->up; - for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) { - int peer = peers[p]; - bool peerNeedReg = false; - struct ncclConnector* recvConn = NULL; - // P2P transport - if (peer == -1 || peer == comm->nRanks) continue; - recvConn = &channel->peers[peer]->recv[0]; - NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg)); - - if (peerNeedReg) { - bool found = false; - for (int pindex = 0; pindex < nPeers; ++pindex) { - if (peerRanks[pindex] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - if (nPeers > 0) { - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); - } - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } - - if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) { - info->nMaxChannels = 16; - } - } -exit: -#endif - return result; -} - -static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { - ncclResult_t ret = ncclSuccess; - uintptr_t offset = 0; - uintptr_t* peerRmtAddrs = NULL; - - *regFlag = 0; - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); - } - if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); - } - - if (*regFlag) - *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); - return ret; -} - static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, @@ -550,10 +249,72 @@ static bool testBudget( return ok; } +ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { + struct ncclKernelPlanner* planner = &comm->planner; + struct ncclTaskColl 
*task; + + task = ncclIntruQueueHead(&planner->collTaskQueue); + while (task != nullptr) { + // Build a ncclDevWorkColl[Reg?] struct for each task. + void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + bool regNeedConnect = true; + struct ncclWorkList* workNode = NULL; + struct ncclDevWorkColl devWork = {}; + + if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) { + workNode = ncclIntruQueueDequeue(&planner->tmpCollWorkQueue); + goto next; + } + ncclRegisterCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.sendbuffOffset = task->sendbuffOffset; + devWork.recvbuffOffset = task->recvbuffOffset; + devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; + devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.isOneRPN = comm->isOneRPN; + devWork.netRegUsed = devWork.regUsed = 0; + if (task->regBufType & NCCL_NET_REG_BUFFER) + devWork.netRegUsed = 1; + if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) + devWork.regUsed = 1; + + if (task->regBufType & NCCL_NVLS_REG_BUFFER) { + struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; // C++ struct assignment + /* NVLS only has one send and recv buffer registered */ + workReg.dnInputs[0] = regBufSend[0]; + workReg.dnOutputs[0] = regBufRecv[0]; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } else { + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct ncclDevWorkColl); + memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); + } +next: + ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + task = task->next; + } + assert(ncclIntruQueueEmpty(&planner->tmpCollWorkQueue)); + return ncclSuccess; +} + // Called once per ncclGroup to organize the user submitted tasks in // comm->planner so that they can be peeled off into plans. ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { struct ncclKernelPlanner* planner = &comm->planner; + planner->persistent = ncclCudaGraphValid(planner->capturingGraph); // Tasks from the sorter come out ordered size descending. struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); // Tasks are assembled by (fn,op,ty) size ascending. 
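// A minimal, self-contained sketch of the regBufType -> device-work flag mapping used by
// ncclTasksRegAndEnqueue above: network registration sets netRegUsed, while IPC/NVLS
// registration sets regUsed. The bit values below are placeholders for illustration only;
// the real NCCL_*_REG_BUFFER constants are defined in NCCL's headers.
#include <cstdint>

enum : uint32_t {            // hypothetical values, for illustration only
  REG_IPC  = 1u << 0,        // stands in for NCCL_IPC_REG_BUFFER
  REG_NVLS = 1u << 1,        // stands in for NCCL_NVLS_REG_BUFFER
  REG_NET  = 1u << 2         // stands in for NCCL_NET_REG_BUFFER
};

struct WorkFlags { uint8_t regUsed = 0, netRegUsed = 0; };

static WorkFlags mapRegBufType(uint32_t regBufType) {
  WorkFlags w;
  if (regBufType & REG_NET)              w.netRegUsed = 1;  // zero-copy path through the network plugin
  if (regBufType & (REG_IPC | REG_NVLS)) w.regUsed    = 1;  // direct peer access via IPC/NVLS mappings
  return w;
}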
@@ -648,7 +409,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; bool regNeedConnect = true; - registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + ncclRegisterCollNvlsBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { @@ -662,32 +423,28 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool } } - struct ncclDevWorkColl devWork = {}; - devWork.sendbuff = (void*)task->sendbuff; - devWork.recvbuff = (void*)task->recvbuff; - devWork.sendbuffOffset = task->sendbuffOffset; - devWork.recvbuffOffset = task->recvbuffOffset; - devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; - devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; - devWork.root = task->root; - devWork.nWarps = task->nWarps; - devWork.redOpArg = task->opDev.scalarArg; - devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; - devWork.oneNode = (comm->nNodes == 1); - devWork.regUsed = task->regBufType; - - struct ncclWorkList* workNode; - switch (task->regBufType) { - case NCCL_REGULAR_BUFFER: - case NCCL_IPC_REG_BUFFER: - case NCCL_COLLNET_REG_BUFFER: - { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); - workNode->workType = ncclDevWorkTypeColl; - workNode->size = sizeof(struct ncclDevWorkColl); - memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); - } break; - case NCCL_NVLS_REG_BUFFER: - { struct ncclDevWorkCollReg workReg = {}; + if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) { + struct ncclDevWorkColl devWork = {}; + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.sendbuffOffset = task->sendbuffOffset; + devWork.recvbuffOffset = task->recvbuffOffset; + devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; + devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.netRegUsed = devWork.regUsed = 0; + if (task->regBufType & NCCL_NET_REG_BUFFER) + devWork.netRegUsed = 1; + if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) + devWork.regUsed = 1; + + struct ncclWorkList* workNode; + if (task->regBufType & NCCL_NVLS_REG_BUFFER) { + struct ncclDevWorkCollReg workReg = {}; workReg.coll = devWork; // C++ struct assignment /* NVLS only has one send and recv buffer registered */ workReg.dnInputs[0] = regBufSend[0]; @@ -695,15 +452,16 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeCollReg; workNode->size = sizeof(struct ncclDevWorkCollReg); - memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); - } break; - default: - /* impossible value */ - WARN("Invalid regBufType %d", task->regBufType); - return ncclInvalidArgument; - } + memcpy((void*)(workNode + 1), (void*)&workReg, workNode->size); + } else { + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct 
ncclDevWorkColl); + memcpy((void*)(workNode + 1), (void*)&devWork, workNode->size); + } - ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + ncclIntruQueueEnqueue(&planner->tmpCollWorkQueue, workNode); + } task = task->next; } @@ -875,15 +633,32 @@ static ncclResult_t scheduleCollTasksToPlan( struct ncclProxyOp* proxyOp; if (c == (int)devWork->channelLo) { proxyOp = &proxyOpLo; + proxyOp->loopOffset = 0; + proxyOp->channelSize = countLo * elementSize; } else if (c == (int)devWork->channelHi) { proxyOp = &proxyOpHi; + proxyOp->loopOffset = (countLo + nMidChannels * countMid) * elementSize; + proxyOp->channelSize = countHi * elementSize; } else { proxyOp = &proxyOpMid; + proxyOp->loopOffset = (countLo + (c - devWork->channelLo - 1) * countMid) * elementSize; + proxyOp->channelSize = countMid * elementSize; } proxyOp->channelId = c; proxyOp->opCount = proxyOpId; proxyOp->task.coll = task; proxyOp->rank = comm->rank; + proxyOp->ringAlgo = NULL; + if (proxyOp->reg && task->algorithm == NCCL_ALGO_RING && (task->recvNetHandles[c] || task->sendNetHandles[c])) { + if (task->func == ncclFuncAllGather) { + proxyOp->ringAlgo = new RingAGAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->count * elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } else if (task->func == ncclFuncAllReduce) { + proxyOp->ringAlgo = new RingARAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.index, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } else if (task->func == ncclFuncBroadcast) { + proxyOp->ringAlgo = new RingBCAlgorithm(task->sendbuff, task->recvbuff, comm->rank, task->root, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } + proxyOp->ringAlgo->incRefCount(); + } addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. 
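The proxyOp->loopOffset / channelSize assignments above split one collective across channels in a lo/mid/hi pattern: the first channel carries countLo elements, each middle channel countMid, and the last channel countHi, with loopOffset giving each channel's byte offset into the buffer. A standalone sketch of just that arithmetic, with invented counts:

    #include <cstdio>
    #include <cstddef>

    int main() {
      const int channelLo = 2, channelHi = 6;                        // hypothetical channel range
      const size_t countLo = 1000, countMid = 800, countHi = 600;    // elements per channel
      const size_t elementSize = 4;                                  // e.g. fp32
      const int nMidChannels = channelHi - channelLo - 1;
      for (int c = channelLo; c <= channelHi; c++) {
        size_t loopOffset, channelSize;
        if (c == channelLo) {
          loopOffset = 0;
          channelSize = countLo * elementSize;
        } else if (c == channelHi) {
          loopOffset = (countLo + nMidChannels * countMid) * elementSize;
          channelSize = countHi * elementSize;
        } else {
          loopOffset = (countLo + (size_t)(c - channelLo - 1) * countMid) * elementSize;
          channelSize = countMid * elementSize;
        }
        printf("channel %d: offset %zu bytes, size %zu bytes\n", c, loopOffset, channelSize);
      }
      return 0;
    }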
@@ -900,6 +675,10 @@ static ncclResult_t scheduleCollTasksToPlan( } if (comm->rank == 0) { + INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}", + ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), devWork->channelLo, devWork->channelHi); + if (task->isCollnet) { TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d", ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), @@ -956,6 +735,7 @@ static ncclResult_t addP2pToPlan( bool protoLL[2] = {!selfSend, !selfSend}; bool network[2] = {false, false}; bool proxySameProcess[2] = {true, true}; + void** handles[2] = {NULL, NULL}; uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); if (!selfSend) { for (int part=0; part < nChannelsMax; part++) { @@ -981,7 +761,7 @@ static ncclResult_t addP2pToPlan( int chunkSize[2]; int chunkDataSize[2]; int chunkDataSize_u32fp8[2]; - bool registered[2] = {false, false}; + bool netRegistered[2] = {false, false}; bool ipcRegistered[2] = {false, false}; for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send @@ -1007,10 +787,20 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { - struct ncclReg* regRecord; - NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); - registered[dir] = regRecord && regRecord->nDevs; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + int regFlag = 0; + NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); + for (int part = 0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; + int peerRank = dir ? sendRank : recvRank; + struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] + : &channelPeers[peerRank]->recv[connIndex]; + if (conn->conn.flags & NCCL_DIRECT_NIC) + ncclRegisterP2pNetBuffer(comm, addrs[dir], bytes[dir], conn, ®Flag, &handles[dir][part], &plan->cleanupQueue); + if (!regFlag) break; + } + netRegistered[dir] = regFlag ? true : false; } } else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) { int peerRank = dir ? sendRank : recvRank; @@ -1020,12 +810,12 @@ static ncclResult_t addP2pToPlan( struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] : &channelPeers[peerRank]->recv[connIndex]; void* regAddr = NULL; - if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { + if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { // We require users registering buffers on both sides - NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], ®Flag, ®Addr, &plan->cleanupQueue)); + NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue)); if (regFlag) { - if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr; - else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr; + if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr; + else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr; } } ipcRegistered[dir] = regFlag ? 
true : false; @@ -1057,7 +847,7 @@ static ncclResult_t addP2pToPlan( work->channelBase = base; work->nSendChannels = nChannels[1]; work->sendProtoLL = protoLL[1]; - work->sendRegistered = registered[1]; + work->sendNetReg = netRegistered[1]; work->sendIpcReg = ipcRegistered[1]; work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; work->sendRank = sendRank; @@ -1065,7 +855,7 @@ static ncclResult_t addP2pToPlan( work->sendBytes = sendBytes==-1 ? 0 : sendBytes; work->nRecvChannels = nChannels[0]; work->recvProtoLL = protoLL[0]; - work->recvRegistered = registered[0]; + work->recvNetReg = netRegistered[0]; work->recvIpcReg = ipcRegistered[0]; work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; work->recvRank = recvRank; @@ -1084,7 +874,7 @@ static ncclResult_t addP2pToPlan( op->protocol = protocol[dir]; op->pattern = dir ? ncclPatternSend : ncclPatternRecv; op->chunkSize = chunkSize[dir]; - op->reg = registered[dir]; + op->reg = netRegistered[dir]; op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; @@ -1116,9 +906,10 @@ static ncclResult_t addP2pToPlan( size_t partBeg, partEnd; ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); if (proxyOps[dir].reg) { - proxyOps[dir].nsteps = 1; - proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg; - proxyOps[dir].nbytes = partEnd-partBeg; + (dir ? proxyOps[dir].sendbuff : proxyOps[dir].recvbuff) = (uint8_t*)addr + partBeg; + (dir ? proxyOps[dir].sendMhandle : proxyOps[dir].recvMhandle) = handles[dir][part]; + proxyOps[dir].nbytes = partEnd - partBeg; + proxyOps[dir].nsteps = DIVUP(proxyOps[dir].nbytes, NCCL_MAX_NET_SIZE); } else { proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize); proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize); @@ -1198,6 +989,8 @@ static ncclResult_t scheduleP2pTasksToPlan( // Skip send to self in-place (we don't need to support this). ncclIntruQueueDequeue(&peers[sendRank].sendQueue); ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send); + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv); comm->planner.nTasksP2p -= 2; } else { // Ensure room for worst case of one new batch per channel. @@ -1302,8 +1095,13 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: + // We rely on 16-byte alignment + #if __cplusplus >= 201103L + fifoBufHost = aligned_alloc(16, ROUNDUP(workBytes, 16)); + #else static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment."); fifoBufHost = malloc(workBytes); + #endif fifoCursor = 0; fifoMask = ~0u; break; @@ -1346,37 +1144,41 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla break; case ncclDevWorkStorageTypePersistent: { ncclResult_t result = ncclSuccess; + struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the // user's graph will be launched later, and it also acquires the deviceStream, // it will observe this upload. 
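A note on the persistent-plan staging buffer above: aligned_alloc requires its size argument to be a multiple of the requested alignment, which is why workBytes is rounded up to 16 before the call (the malloc path instead relies on max_align_t being at least 16-byte aligned). A minimal sketch of the padding, where ROUNDUP is a local stand-in rather than NCCL's macro:

    #include <cstdlib>
    #include <cstdio>

    #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main() {
      size_t workBytes = 1000;                           // hypothetical payload size
      size_t padded = ROUNDUP(workBytes, (size_t)16);    // 1008, a multiple of 16
      void* buf = aligned_alloc(16, padded);             // C11/C++17 allocation, 16-byte aligned
      printf("requested %zu, allocated %zu, ptr %p\n", workBytes, padded, buf);
      free(buf);
      return 0;
    }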
- NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope); + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); cudaEvent_t memcpyDone; - CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); - struct uploadWork_cleanup_t* cleanup; - NCCLCHECK(ncclCalloc(&cleanup, 1)); + NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; cleanup->base.event = memcpyDone; cleanup->hostBuf = fifoBufHost; - ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base); + ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope); - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (result != ncclSuccess) return result; + if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode); + return result; + fail: + if (!cleanup) free(fifoBufHost); + goto finish_scope; } break; default: break; } @@ -1388,6 +1190,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; + comm->collOpCount += plan->collOpCount; struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); while (op != nullptr) { @@ -1410,18 +1213,9 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* NCCLCHECK(ncclProxySaveOp(comm, op, nullptr)); op->opCount = oldId; // Restore for next uploadProxyOps() - - struct ncclProxyOp* opNext = op->enqNext; - if (!plan->persistent) { - // Non-persistent kernels upload ops only once so can be free'd here. - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op); - } - op = opNext; + op = op->enqNext; } - // Erase proxyOpQueue since all ops were free'd back to mempool. - if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue); - for (int c=0; c < MAXCHANNELS; c++) { // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; @@ -1450,6 +1244,8 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { @@ -1462,32 +1258,41 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* CUDACHECK(cudaFree(plan->workBufPersistent)); CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); } - struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); - while (q != nullptr) { - struct ncclProxyOp* q1 = q->enqNext; - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); - q = q1; - } - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct != nullptr) { - struct ncclTaskColl* ct1 = ct->next; - ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct); - ct = ct1; - } - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt != nullptr) { - struct ncclTaskP2p* pt1 = pt->next; - ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt); - pt = pt1; - } - ncclResult_t result = ncclSuccess; - while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { - struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); - ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb - if (res1 != ncclSuccess) result = res1; - } - NCCLCHECK(result); } + // Free coll tasks + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct != nullptr) { + struct ncclTaskColl* ct1 = ct->next; + free(ct->sendNetHandles); + free(ct->recvNetHandles); + free(ct->srecvNetHandles); + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct); + ct = ct1; + } + // Free p2p tasks + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt != nullptr) { + struct ncclTaskP2p* pt1 = pt->next; + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt); + pt = pt1; + } + // Free proxy ops + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); + while (q != nullptr) { + struct ncclProxyOp* q1 = q->enqNext; + if (q->ringAlgo && q->ringAlgo->decRefCount() == 0) delete q->ringAlgo; + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + q = q1; + } + // Run other free callbacks + ncclResult_t result = ncclSuccess; + while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { + struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); + ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb + if (res1 != ncclSuccess) result = res1; + } + NCCLCHECK(result); + // Free plan struct ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; } @@ -1509,10 +1314,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { planner->persistent = persistent; int nPlans = 0; - // Poll for callbacks sent to us from other threads. Typically these free - // resources from to our memory pools. 
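Related to the reclaim path above (the proxy-op loop deletes q->ringAlgo only once its count drops to zero): the ring algorithm objects created in scheduleCollTasksToPlan are shared by reference count rather than owned by a single queue. A generic sketch of that incRefCount()/decRefCount() pattern, not NCCL's actual RingAlgorithm class:

    #include <atomic>
    #include <cstdio>

    struct RefCounted {
      std::atomic<int> refs{0};
      void incRefCount() { refs.fetch_add(1, std::memory_order_relaxed); }
      int decRefCount() { return refs.fetch_sub(1, std::memory_order_acq_rel) - 1; }
      virtual ~RefCounted() = default;
    };

    struct ToyRingAlgo : RefCounted {
      ~ToyRingAlgo() override { printf("ring algo freed\n"); }
    };

    int main() {
      ToyRingAlgo* algo = new ToyRingAlgo();
      algo->incRefCount();                         // reference held by the plan's proxy op
      algo->incRefCount();                         // a second holder, e.g. a copied op elsewhere
      if (algo->decRefCount() == 0) delete algo;   // count goes 2 -> 1: not freed yet
      if (algo->decRefCount() == 0) delete algo;   // count goes 1 -> 0: freed here
      return 0;
    }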
- NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); - if (planner->nTasksColl + planner->nTasksP2p != 0) { do { memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); @@ -1577,7 +1378,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { } NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { + if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; @@ -1587,6 +1388,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } + if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + plan->isHostCbEnq = true; NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } @@ -1602,6 +1405,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } + failure: return result; } @@ -1694,7 +1498,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) { + if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) { // We are not using the host stream for proxy ops and reclaimation submission. NCCLCHECK(hostStreamPlanTask(comm, plan)); } else { @@ -1778,8 +1582,7 @@ static void initCollCostTable(float** collCostTable) { static ncclResult_t updateCollCostTable( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, int collNetSupport, int nvlsSupport, int numPipeOps, - float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime - ) { + float** collCostTable) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; if (comm->nRanks == 1) { @@ -1799,16 +1602,12 @@ static ncclResult_t updateCollCostTable( if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &time, &backup)); - if (!backup) { - table[a][p] = time; - } else { - if (time >= 0.0 && time < *backupTime) { - *backupAlgo = a; - *backupProto = p; - *backupTime = time; + NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &table[a][p])); + // Relegate fp8 reduction trees of sufficient depth that they incur precision loss + // to be least preferred. + if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { + if (a == NCCL_ALGO_RING && comm->nRanks > 8) { + table[a][p] *= 1024.0; // Any factor large enough to act as a partition between lossy and non-lossy algos. 
} } } @@ -1819,7 +1618,7 @@ static ncclResult_t updateCollCostTable( static ncclResult_t topoGetAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, - float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo + float** collCostTable, ncclSimInfo_t* simInfo ) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; @@ -1844,15 +1643,19 @@ static ncclResult_t topoGetAlgoInfo( // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case. // coverity[check_after_sink] if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { - if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; + char ncclAlgoEnvStr[1024] = ""; + char ncclProtoEnvStr[1024] = ""; + char* algoEnv = getenv("NCCL_ALGO"); + if (algoEnv) { + snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - info->algorithm = backupAlgo; - info->protocol = backupProto; - time = backupTime; + char* protoEnv = getenv("NCCL_PROTO"); + if (protoEnv) { + snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); + } + WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr); + return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError; } - if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time); if (simInfo) simInfo->estimatedTime = time; TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); @@ -1913,19 +1716,24 @@ static ncclResult_t getAlgoInfo( info->algorithm = NCCL_ALGO_UNDEF; info->protocol = NCCL_PROTO_UNDEF; int nMaxChannels = 0; - int backupAlgo = NCCL_ALGO_UNDEF; - int backupProto = NCCL_PROTO_UNDEF; - float backupTime = 3600000000.0; float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; initCollCostTable((float **)collCostTable); - NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime)); + NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable)); if (comm->tuner != NULL) { + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + struct ncclReg* regSendBuf; + struct ncclReg* regRecvBuf; + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); + int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister())); NCCLCHECK(comm->tuner->getCollInfo( comm->tunerContext, info->func, nBytes, numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, - &nMaxChannels)); + regBuff, &nMaxChannels)); } - NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo)); + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); info->nMaxChannels = nMaxChannels == 0 ? 
info->nMaxChannels : nMaxChannels; return ncclSuccess; } @@ -1975,37 +1783,7 @@ static ncclResult_t calcCollChunking( } int nstepsPerLoop, nchunksPerLoop; - switch (pattern) { - case ncclPatternTreeUp: - case ncclPatternTreeDown: - case ncclPatternTreeUpDown: - case ncclPatternPatUp: - case ncclPatternPatDown: - case ncclPatternPipelineFrom: - case ncclPatternPipelineTo: - case ncclPatternCollnetChain: - nstepsPerLoop = nchunksPerLoop = 1; - break; - case ncclPatternNvls: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; - break; - case ncclPatternCollnetDirect: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; - break; - case ncclPatternRing: - nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; - break; - case ncclPatternRingTwice: - nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; - break; - case ncclPatternNvlsTree: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; - break; - default: - WARN("Unknown pattern %d", pattern); - return ncclInternalError; - } - + size_t loopOffset = 0; int stepSize = comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1; @@ -2066,22 +1844,60 @@ static ncclResult_t calcCollChunking( // Compute directFlags of work struct. if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Set direct direction for broadcast-gather (read or write) - *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_P2P_READ : NCCL_P2P_WRITE; } else { *outDirectFlags = 0; } // Compute nSteps for proxies - //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize - int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize); + switch (pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPatUp: + case ncclPatternPatDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternCollnetChain: + nstepsPerLoop = nchunksPerLoop = 1; + break; + case ncclPatternNvls: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + loopOffset = nChannels * chunkSize * comm->channels[0].nvls.headRank; + break; + case ncclPatternCollnetDirect: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; + loopOffset = nChannels * chunkSize * comm->channels[0].collnetDirect.headRank; + break; + case ncclPatternRing: + nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; + break; + case ncclPatternRingTwice: + nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; + break; + case ncclPatternNvlsTree: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } + + // Compute nSteps for proxies + size_t loopSize = size_t(nChannels)*nchunksPerLoop*chunkSize; + int nLoops = (int)DIVUP(nBytes, loopSize); memset(proxyOp, 0, sizeof(*proxyOp)); proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps; proxyOp->sliceSteps = sliceSteps; 
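To make the proxy step accounting in this hunk concrete: loopSize is the number of bytes all channels move per loop, nLoops rounds the message up to whole loops, and nsteps multiplies in the per-loop step count and chunk granularity (the slice size set just below follows the same chunk arithmetic). A worked sketch for a hypothetical ring allreduce (pattern ncclPatternRingTwice, so nstepsPerLoop = 2*(nRanks-1) and nchunksPerLoop = nRanks); all numbers are invented and DIVUP is a local stand-in:

    #include <cstdio>
    #include <cstddef>

    #define DIVUP(x, y) (((x) + (y) - 1) / (y))

    int main() {
      const int nRanks = 8, nChannels = 4;
      const size_t nBytes = 64 << 20;                // 64 MiB message
      const int chunkSteps = 4, sliceSteps = 2;
      const size_t chunkSize = 1 << 20;              // 1 MiB chunks
      const int nstepsPerLoop = 2 * (nRanks - 1);    // ring, twice around
      const int nchunksPerLoop = nRanks;
      size_t loopSize = (size_t)nChannels * nchunksPerLoop * chunkSize;   // 32 MiB per loop
      int nLoops = (int)DIVUP(nBytes, loopSize);                          // 2
      int nsteps = nstepsPerLoop * nLoops * chunkSteps;                   // 14 * 2 * 4 = 112
      size_t sliceSize = chunkSize / chunkSteps * sliceSteps;             // 512 KiB
      printf("loopSize=%zu nLoops=%d nsteps=%d sliceSize=%zu\n", loopSize, nLoops, nsteps, sliceSize);
      return 0;
    }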
proxyOp->chunkSteps = chunkSteps; proxyOp->chunkSize = chunkSize; + proxyOp->sliceSize = chunkSize / chunkSteps * sliceSteps; + proxyOp->loopSize = loopSize; + proxyOp->loopOffset = loopOffset; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; + proxyOp->algorithm = info->algorithm; if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { proxyOp->redOp = ncclSum; // Network sees avg as sum } else { @@ -2090,17 +1906,50 @@ static ncclResult_t calcCollChunking( proxyOp->pattern = pattern; proxyOp->coll = info->func; proxyOp->root = info->root; + proxyOp->isOneRPN = comm->isOneRPN; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up proxyOp->nbytes = stepSize*sliceSteps; - if (info->regBufType == NCCL_COLLNET_REG_BUFFER) { + if (info->regBufType & NCCL_NET_REG_BUFFER) { proxyOp->reg = 1; - proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE); - proxyOp->sendMhandle = info->sendMhandle; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + if (proxyOp->isOneRPN) { + proxyOp->nsteps = 1; + proxyOp->loopOffset = 0; + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->sendMhandle = info->sendMhandle; + } else { + if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { + proxyOp->nbytes = nBytes / nchunksPerLoop; + proxyOp->loopSize = proxyOp->loopSize / nchunksPerLoop; + proxyOp->loopOffset = 0; + if (info->func == ncclFuncAllGather) { + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->sendMhandle = info->sendMhandle; + } + } else { + proxyOp->sendbuff = (uint8_t*)info->recvbuff; + proxyOp->sendMhandle = info->recvMhandle; + } + } + } else if (info->algorithm == NCCL_ALGO_RING) { + if (proxyOp->isOneRPN && info->func == ncclFuncAllGather) { + proxyOp->chunkSize = NCCL_MAX_NET_SIZE; + proxyOp->sliceSize = NCCL_MAX_NET_SIZE; + proxyOp->chunkSteps = 1; + proxyOp->sliceSteps = 1; + proxyOp->loopSize = size_t(nChannels) * nchunksPerLoop * proxyOp->chunkSize; + proxyOp->nsteps = DIVUP(nBytes, proxyOp->loopSize) * nstepsPerLoop; + proxyOp->loopOffset = 0; + } + } else { + WARN("Net registration invalid algorithm %s", ncclAlgoToString(info->algorithm)); + return ncclInternalError; + } + proxyOp->recvMhandle = info->recvMhandle; - proxyOp->sendbuff = (uint8_t*)info->sendbuff; proxyOp->recvbuff = (uint8_t*)info->recvbuff; proxyOp->nbytes = nBytes; } else { @@ -2119,7 +1968,7 @@ static ncclResult_t calcCollChunking( proxyOp->nbytes = DIVUP(nBytes, nChannels); } - *outChunkSize = chunkSize; + *outChunkSize = proxyOp->chunkSize; return ncclSuccess; } @@ -2130,10 +1979,13 @@ static ncclResult_t hostToDevRedOp( int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; + __half f16; float f32; double f64; #if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; #endif + #if defined(__CUDA_FP8_TYPES_EXIST__) + __nv_fp8_storage_t f8; + #endif void *ptr; }; u64 = 0; @@ -2144,7 +1996,8 @@ static ncclResult_t hostToDevRedOp( if (nbits <= 0) return ncclInvalidArgument; uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); - + bool datatype_signed = false; + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2162,10 +2015,22 @@ static ncclResult_t hostToDevRedOp( case 
ncclAvg: switch ((int)datatype) { case ncclInt8: case ncclInt32: case ncclInt64: + datatype_signed = true; + // no break, we want to fall through... case ncclUint8: case ncclUint32: case ncclUint64: opFull->op = ncclDevSumPostDiv; - u64 = comm->nRanks; + u64 = comm->nRanks<<1 | datatype_signed; break; + #if defined(__CUDA_FP8_TYPES_EXIST__) + case ncclFloat8e4m3: + opFull->op = ncclDevPreMulSum; + f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E4M3); + break; + case ncclFloat8e5m2: + opFull->op = ncclDevPreMulSum; + f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E5M2); + break; + #endif case ncclFloat16: opFull->op = ncclDevPreMulSum; f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x @@ -2257,6 +2122,13 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Empty collectives can be discarded. if (info->count == 0) return ncclSuccess; + if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { + if (comm->minCompCap < 90) { + WARN("FP8 reduction support begins with sm90 capable devices."); + return ncclInvalidArgument; + } + } + // Copy reduction op state from op handle into info struct here since the // op handle may be destroyed before ncclGroupEnd(). struct ncclDevRedOpFull opDev; diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 999312a0d..6e9356826 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -248,11 +248,31 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); int ncclTopoUserP2pLevel = -1; -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) { +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, + int* p2p, int *read, int* intermediateRank) { + int mnnvl = 0; + struct ncclPeerInfo* info1 = NULL; + struct ncclPeerInfo* info2 = NULL; *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; + // Rule out different nodes / isolated containers + if (comm) { + info1 = comm->peerInfo+rank1; + info2 = comm->peerInfo+rank2; + if (info1->hostHash != info2->hostHash) { + if (comm->MNNVL) { + NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, &mnnvl)); + if (!mnnvl) return ncclSuccess; + } else { + return ncclSuccess; + } + } else if (info1->shmDev != info2->shmDev) { + return ncclSuccess; + } + } + // Get GPUs from topology int g1, g2; NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1)); @@ -297,7 +317,8 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to // validate against NVML at all since they are pretending to be on other hw. 
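Returning to the ncclAvg handling in hostToDevRedOp earlier in this hunk: for integer types the scalar passed to ncclDevSumPostDiv now packs the rank count together with a signedness bit (comm->nRanks<<1 | datatype_signed) instead of just the rank count. A small sketch of that encoding and of how a consumer can presumably unpack it; the variable names here are invented:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t nRanks = 8;
      const bool datatypeSigned = true;                                // e.g. ncclInt32
      uint64_t scalarArg = (nRanks << 1) | (datatypeSigned ? 1 : 0);   // == 17
      uint64_t divisor = scalarArg >> 1;                               // 8 ranks to divide by
      bool isSigned = (scalarArg & 1) != 0;                            // true
      printf("scalarArg=%llu divisor=%llu signed=%d\n",
             (unsigned long long)scalarArg, (unsigned long long)divisor, (int)isSigned);
      return 0;
    }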
- if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { + if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && + info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); @@ -356,14 +377,14 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) { +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { *useGdr = 0; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it @@ -404,12 +425,32 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6 distance = proxyGpu->paths[NET][n].type; } if (distance > netGdrLevel) { - INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel); + INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read); + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + return ncclSuccess; +} + +ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { + int netNum = system->nodes[NET].count; + int useGdr = 0; + *avail = false; + for (int n = 0; n < netNum; n++) { + int64_t netId = system->nodes[NET].nodes[n].id; + NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, 1, &useGdr)); + if (useGdr) { + *avail = true; + break; + } + NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, 0, &useGdr)); + if (useGdr) { + *avail = true; + break; + } + } return ncclSuccess; } @@ -417,12 +458,17 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) { +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) { + *flush = 1; + ncclNetProperties_t props; + NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); + if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; int g; - NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + struct ncclTopoSystem* system = comm->topo; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier - *flush = gpu->gpu.cudaCompCap < 90 ? 
1 : ncclParamNetForceFlush(); + if (gpu->gpu.cudaCompCap >= 90) *flush = 0; return ncclSuccess; } @@ -516,7 +562,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; int useGdr; - NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); if (useGdr == 0) continue; int found = 0; for (int r=0; rnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; - NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); + NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank, + system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -618,7 +665,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm if (gpu->paths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. int gdr; - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; diff --git a/src/graph/search.cc b/src/graph/search.cc index ad6f58054..9b72ac160 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1142,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } if (system->nodes[NET].count > 0) { - sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); + sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); offset = strlen(line); } INFO(NCCL_GRAPH, "%s", line); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9771ae05c..d758ac989 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -296,7 +296,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); + sprintf(line+nextOffset, "%s/%lx-%lx (%d/%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.collSupport, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id)); } @@ -383,6 +383,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s if (strcmp(xmlNet->name, "net") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); + // This means that the "dev" attribute wasn't set on this net xml node. 
That means it should not be added to the system topology graph if (index == -1) continue; NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId)); } @@ -403,7 +404,7 @@ struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane -ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) { +ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId, int numaId) { const char* str; int type; @@ -430,9 +431,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s if (xmlNic != NULL) { type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. - busId &= 0xfffffffffffffff0; struct ncclTopoNode* nicNode = NULL; - int64_t id = NCCL_TOPO_ID(systemId, busId); + int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId); + int64_t id = NCCL_TOPO_ID(systemId, localNicId); NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id)); @@ -453,7 +454,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s for (int s=0; s<xmlPci->nSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later - NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId, numaId)); } } } @@ -520,12 +521,14 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s } for (int s=0; s<xmlCpu->nSubs; s++) { struct ncclXmlNode* node = xmlCpu->subs[s]; - if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId)); + if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; - NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); + int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0); + int64_t id = NCCL_TOPO_ID(systemId, localNicId); + NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id)); if (nic == NULL) { - NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0))); + NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id)); NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } @@ -725,14 +728,528 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { return ncclSuccess; } -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { +// This is just checking for direct descendence +int ncclTopoCheckPix(ncclXmlNode* common, ncclXmlNode** nodes, int nNodes) { + const char* tempBusId; + // If the common parent isn't a pci switch, then this isn't PIX + NCCLCHECK(xmlGetAttrStr(common, "busid", &tempBusId)); + if (tempBusId == NULL) return 0; + TRACE(NCCL_GRAPH, "Checking pix for busid=%s", tempBusId); + + // All the nodes must have a "nic" which is a parent, and then a pci node (busid) which must be a child of the "common" + for (int i = 0; i < nNodes; i++) { + ncclXmlNode* node = nodes[i]; + if (strcmp(node->name, "net") == 0) { + node = 
node->parent; + if (node == NULL) return 0; + if (strcmp(node->name, "nic") == 0) { + node = node->parent; + if (node == NULL) return 0; + // All nodes must descend from the same first level pci switch + if (strcmp(node->name, "pci") == 0) { + TRACE(NCCL_GRAPH, "Comparing parent of node=%p to common=%p", node->parent, common); + if (node->parent != common) return 0; + } + } + } + } + + return 1; +} + +#define NCCL_TOPO_XML_DEPTH_MAX 256 +typedef struct xmlNodeStack { + ncclXmlNode* elems[NCCL_TOPO_XML_DEPTH_MAX]; + int tail; + + ncclXmlNode* top() { + if (!empty()) { + return elems[tail - 1]; + } else { + return NULL; + } + } + + ncclXmlNode* pop() { + ncclXmlNode* node = top(); + if (node) { + tail--; + } + return node; + } + + void push(ncclXmlNode* node) { + if (tail < NCCL_TOPO_XML_DEPTH_MAX) { + elems[tail++] = node; + } + } + + bool empty() { + return tail == 0; + } + +} xmlNodeStack; + +// 1. Find the common parent xmlNode between the given set of nodes +ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { + // Track a stack of parents per-net node being merged + xmlNodeStack* parents; + NCCLCHECK(ncclCalloc(&parents, nNodes)); + // Find the common parent + ncclXmlNode* common = NULL; + + if (nNodes == 1) { + common = nodes[0]; + *path = PATH_LOC; + goto out; + } + + for (int i = 0; i < nNodes; i++) { + ncclXmlNode* temp; + temp = nodes[i]; + while (temp) { + parents[i].push(temp); + temp = strcmp(temp->name, "system") == 0 ? NULL : temp->parent; + } + } + + common = NULL; + int c; + c = 1; + while (c && !parents[0].empty()) { + ncclXmlNode* temp = parents[0].top(); + for (int i = 1; i < nNodes; i++) { + if (!parents[i].empty()) { + c &= (temp == parents[i].top()); + } else { + c = 0; + break; + } + } + + if (c) { + common = temp; + if (common == NULL) TRACE(NCCL_GRAPH, "COMMON IS NULL"); + for (int i = 0; i < nNodes; i++) { + parents[i].pop(); + } + // Check multi-port while we still have the mismatched parents + // For multi-port to be true, all parents (peers) must have the busId attribute with all but the last character matching + } else { + int multiPort = 1; + const char* tempBusId; + + NCCLCHECK(xmlGetAttr(temp, "busid", &tempBusId)); + if (tempBusId) { + for (int i = 1; i < nNodes; i++) { + if (!parents[i].empty()) { + const char* busId; + NCCLCHECK(xmlGetAttr(parents[i].top(), "busid", &busId)); + if (busId) { + if (strlen(busId) != strlen(tempBusId)) { + multiPort = 0; + break; + } + if (strncmp(busId, tempBusId, strlen(busId)-1) != 0) { + multiPort = 0; + break; + } + } else { + multiPort = 0; + break; + } + } + } + } else { + multiPort = 0; + } + + if (multiPort) { + *path = PATH_PORT; + goto out; + } + } + } + + if (common == NULL) { + *path = PATH_DIS; + } else if (strcmp(common->name,"system") == 0) { + *path = PATH_SYS; + } else if (strcmp(common->name, "cpu") == 0) { + *path = PATH_PHB; + } else if (strcmp(common->name, "nic") == 0) { + *path = PATH_PORT; + } else if (strcmp(common->name, "net") == 0) { + *path = PATH_PORT; + } else if (ncclTopoCheckPix(common, nodes, nNodes)) { + *path = PATH_PIX; + } else { + *path = PATH_PXB; + } + +out: + *parent = common; + free(parents); + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeUniqueBusId(struct ncclXml* xml, char* busId, struct ncclXmlNode** pciNode, struct ncclXmlNode* parent) { + int i = 0; + int64_t rBusId; + NCCLCHECK(busIdToInt64(busId, &rBusId)); + // Try to find an unused busid - NCCL expects leaf busid to be unique + while (i < 100) { + rBusId++; + 
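The ncclTopoGetPath logic above classifies how a set of net nodes relate by walking their XML ancestor chains: each node pushes its parents up to the system root onto a stack, the stacks are popped in lockstep while all tops still agree, and the last node they agreed on is the common parent, whose type (system, cpu, first-level pci switch, nic, ...) maps to PATH_SYS, PATH_PHB, PATH_PIX, PATH_PORT and so on. A generic common-ancestor sketch of that walk, using invented node types rather than NCCL's xml structures:

    #include <vector>
    #include <string>
    #include <cstdio>

    struct Node { std::string type; Node* parent; };

    static std::vector<Node*> ancestors(Node* n) {
      std::vector<Node*> chain;                // plays the role of the xmlNodeStack above
      for (; n; n = n->parent) chain.push_back(n);
      return chain;                            // leaf first, root ("system") last
    }

    int main() {
      Node system{"system", nullptr}, cpu{"cpu", &system};
      Node pciSwitch{"pci", &cpu}, nic0{"net", &pciSwitch}, nic1{"net", &pciSwitch};
      std::vector<Node*> a = ancestors(&nic0), b = ancestors(&nic1);
      Node* common = nullptr;
      while (!a.empty() && !b.empty() && a.back() == b.back()) {   // pop in lockstep from the root end
        common = a.back(); a.pop_back(); b.pop_back();
      }
      printf("common parent type: %s\n", common ? common->type.c_str() : "none");   // prints "pci"
      return 0;
    }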
TRACE(NCCL_GRAPH, "Trying to make new busId %lx", rBusId); + int64ToBusId(rBusId, busId); + struct ncclXmlNode* temp = NULL; + NCCLCHECK(xmlFindTagKv(xml, "pci", &temp, "busid", busId)); + if (temp == NULL) { + NCCLCHECK(xmlAddNode(xml, parent, "pci", pciNode)); + NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); + TRACE(NCCL_GRAPH, "Made new busId %lx", rBusId); + return ncclSuccess; + } + TRACE(NCCL_GRAPH, "Conflicting busId %lx", rBusId); + i++; + } + + WARN("TOPO/NET : Couldn't generate unique busId after %d tries", i); + return ncclInternalError; +} + +ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** parent, struct ncclXmlNode* physNetNode) { + struct ncclXmlNode* newBusId = NULL; + struct ncclXmlNode* pci = physNetNode->parent; + if (pci) { + pci = pci->parent; + if (pci) { + if (strcmp(pci->name, "pci") == 0) { + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + memset(busId, 0, sizeof(busId)); + const char* originalBusId; + // Seed busId with the current NIC 0's busId to make discovering a unique hash quicker + NCCLCHECK(xmlGetAttrStr(pci, "busid", &originalBusId)); + snprintf(busId, sizeof(busId), "%s", originalBusId); + NCCLCHECK(ncclTopoMakeUniqueBusId(xml, busId, &newBusId, *parent)); + for (int i = 0; i < pci->nAttrs; i++) { + NCCLCHECK(xmlSetAttr(newBusId, pci->attrs[i].key, pci->attrs[i].value)); + } + NCCLCHECK(xmlSetAttr(newBusId, "busid", busId)); + *parent = newBusId; + } + } + } + + if (newBusId == NULL) { + const char* name; + NCCLCHECK(xmlGetAttr(physNetNode, "name", &name)); + WARN("TOPO/NET : Can't find busId of child 0 %s", name); + return ncclInternalError; + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, +struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInternalError; + } + + // Trigger the merge, then get the new device's properties + int vDevIndex = 0; + ncclResult_t ret = makeVDevice(&vDevIndex, vProps); + if (ret == ncclInvalidUsage) { + WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC"); + NCCLCHECK(ret); + } + + INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); + return ncclSuccess; +} + +ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + char* semi_token; + char* semi = strtok_r(str, ";", &semi_token); + while (semi) { + TRACE(NCCL_NET, "Fusing %s", semi); + struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; + int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); + if (nUserIfs == 0) { + INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. 
Please provide a semicolon-delimited list of comma-delimited NIC groups.", + str, semi); + continue; + } + + ncclNetVDeviceProps_t vProps = {0}; + for (int d = 0; d < nPhysDevs; d++) { + if (matchIfList(propsList[d].name, propsList[d].port, userIfs, nUserIfs, 1)) { + vProps.devs[vProps.ndevs++] = d; + } + } + + if (vProps.ndevs != nUserIfs) { + WARN("TOPO/NET : Only matched %d devices, %d requested from %s", + vProps.ndevs, nUserIfs, semi); + return ncclInvalidUsage; + } + + if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInvalidUsage; + } + + struct ncclXmlNode* netNode; + NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice)); + + // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) + for (int i = 0; i < vProps.ndevs; i++) { + placedDevs[vProps.devs[i]] = 1; + } + + semi = strtok_r(NULL, ";", &semi_token);; + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + // Compute the path type between each device + int* paths = NULL; + ncclResult_t res = ncclSuccess; + ncclCalloc(&paths, nPhysDevs*nPhysDevs); + TRACE(NCCL_GRAPH, "Allocated %d paths", nPhysDevs*nPhysDevs); + for (int i = 0; i < nPhysDevs; i++) { + for (int j = 0; j < nPhysDevs; j++) { + struct ncclXmlNode* nodes[2]; + nodes[0] = physNetNodes[i]; + nodes[1] = physNetNodes[j]; + struct ncclXmlNode* parent; + NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out); + } + } + + // Place all remaining physical devices into a virtual device given the mergeLevel criteria + for (int i = 0; i < nPhysDevs; i++) { + // Select the first unplaced device "i" as the root + if (placedDevs[i] == 0) { + // Init a new vDevice + ncclNetVDeviceProps_t vProps; + vProps = {0}; + vProps.devs[vProps.ndevs++] = i; + placedDevs[i] = 1; + TRACE(NCCL_GRAPH, "Placed dev %d", i); + + // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i" + // (Don't merge the same device with itself) + for (int j = 0; j < nPhysDevs; j++) { + if (paths[i*nPhysDevs + j] <= mergeLevel && + placedDevs[j] == 0 && j != i) { + vProps.devs[vProps.ndevs++] = j; + placedDevs[j] = 1; + TRACE(NCCL_GRAPH, "Placed dev %d path=%d", j, paths[i*nPhysDevs + j] ); + } + if (vProps.ndevs == NCCL_NET_MAX_DEVS_PER_NIC) break; + } + + if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInternalError; + } + + struct ncclXmlNode* netNode; + NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out); + } + } + +out: + free(paths); + return res; +} + +struct kvDict nicPathKvList[] = { + { "LOC", PATH_LOC }, + { "PORT", PATH_PORT }, + { "PIX", PATH_PIX }, + { "PXB", PATH_PXB }, + { "PXN", PATH_PXN }, + { "PHB", PATH_PHB }, + { "SYS", PATH_SYS }, + { NULL, 0 } +}; + +ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) { + ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC]; + ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC]; + for (int i = 0; i < vProps->ndevs; i++) { + NCCLCHECK(getProperties(vProps->devs[i], props + i)); + struct ncclXmlNode* physNetNode; + NCCLCHECK(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name)); + physNetNodes[i] = physNetNode; + TRACE(NCCL_GRAPH, "Re-found physical ncclNet node %d %s", i, props[i].name); + } + + int path = PATH_LOC; + NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent)); + if (path == PATH_LOC) { + *parent = NULL; + } else if (parent && strcmp((*parent)->name, "pci") == 0) { + // If the common parent is PCI, we must reparent the new NIC under a made up busId + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } + TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { + int* placedDevs = NULL; + struct ncclXmlNode** physNetNodes = NULL; + if (physicalDevs == 0) return ncclSuccess; + + ncclCalloc(&physNetNodes, physicalDevs); + ncclResult_t res = ncclSuccess; + + ncclNetProperties_t* props = NULL; + ncclCalloc(&props, physicalDevs); + for (int i = 0; i < physicalDevs; i++) { + NCCLCHECKGOTO(getProperties(i, props + i), res, out); + struct ncclXmlNode* physNetNode; + NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out); + physNetNodes[i] = physNetNode; + TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i, props[i].name); + } + + // By default, don't merge any devices + int mergeLevel; + mergeLevel = PATH_PORT; + char* mergeLevelEnv; + mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + char* forceMerge; + forceMerge = getenv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); + + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } + NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + +out: + free(physNetNodes); + free(props); + if (placedDevs) free(placedDevs); + return res; +} + +static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) { + for (int n = startIndex; n < endIndex; n++) { + ncclNetProperties_t props; + NCCLCHECK(getProperties(n, &props)); + struct ncclXmlNode* netNode = NULL; + struct 
ncclXmlNode* parent = NULL; + if (virtualNics) { + struct ncclXmlNode* net = NULL; + NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name)); + // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC + // Only run this if the net doesn't exist locally - this may alter the XML state + if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent)); + } + + NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent)); + + const char* colAttr; + NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); + + // If coll == 0 but the netNode is tagged as coll, don't update the keep value + if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); + NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); + NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); + NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); + NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); + NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); + NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); + // Only set coll if it's not 0 + if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + + const char* keepAttr; + NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); + NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr)); + INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s", + props.name, props.pciPath, keepAttr, colAttr); + } + + return ncclSuccess; +} + +struct ncclTopoNetState { + int nVirtualNics; + int nPhysicalNics; + const char* name; +}; + +// Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. 
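ncclTopoAutoMerge above fuses physical NICs into one virtual NIC greedily: the first unplaced device becomes a group root, and every other unplaced device whose path type to that root is at most the merge level (PATH_PORT unless NCCL_NET_MERGE_LEVEL overrides it, with NCCL_NET_FORCE_MERGE bypassing the distance test entirely) joins the group, up to NCCL_NET_MAX_DEVS_PER_NIC devices. A toy sketch of that grouping policy; the path constants, the matrix and the device count here are invented for illustration:

    #include <cstdio>

    enum { PATH_LOC = 0, PATH_PORT = 1, PATH_PIX = 2, PATH_PXB = 3, PATH_PHB = 4, PATH_SYS = 5 };
    const int MAX_DEVS_PER_VNIC = 4;            // stand-in for NCCL_NET_MAX_DEVS_PER_NIC

    int main() {
      const int nDevs = 4;
      // Hypothetical pairwise paths: devs 0/1 are ports of one NIC, devs 2/3 share a PCI switch.
      int paths[nDevs][nDevs] = {
        {PATH_LOC,  PATH_PORT, PATH_SYS, PATH_SYS},
        {PATH_PORT, PATH_LOC,  PATH_SYS, PATH_SYS},
        {PATH_SYS,  PATH_SYS,  PATH_LOC, PATH_PIX},
        {PATH_SYS,  PATH_SYS,  PATH_PIX, PATH_LOC},
      };
      int mergeLevel = PATH_PORT;               // default: only fuse ports of the same NIC
      bool placed[nDevs] = {};
      for (int i = 0; i < nDevs; i++) {
        if (placed[i]) continue;
        int group[MAX_DEVS_PER_VNIC], n = 0;
        group[n++] = i; placed[i] = true;       // device i is the group root
        for (int j = 0; j < nDevs && n < MAX_DEVS_PER_VNIC; j++) {
          if (!placed[j] && j != i && paths[i][j] <= mergeLevel) { group[n++] = j; placed[j] = true; }
        }
        printf("vNIC:"); for (int k = 0; k < n; k++) printf(" dev%d", group[k]); printf("\n");
      }
      return 0;
    }

With mergeLevel left at PATH_PORT this prints one fused vNIC for devs 0/1 and keeps devs 2/3 separate; raising the level (e.g. NCCL_NET_MERGE_LEVEL=PIX) would fuse devs 2/3 as well.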
+static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) { + int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); + if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); + // Enumerate physical devices + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0)); + if (!usePhysicalDevices) { + if (state->nVirtualNics == -1) { + NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics)); + int nDevs; + NCCLCHECK(devices(&nDevs)); + state->nVirtualNics = nDevs - state->nPhysicalNics; + } + // Remove keep=1 for physical collnets + if (state->nVirtualNics > 0) { + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0)); + // Populate new devices + NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1)); + } + } + + return ncclSuccess; +} + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; +ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; +ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) { + INFO(NCCL_GRAPH, "Retrieving state for %s", name); + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) { + // Empty slot + if (states[i].name == NULL) { + states[i].nVirtualNics = -1; + states[i].nPhysicalNics = -1; + states[i].name = strdup(name); + *state = states + i; + INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name); + return ncclSuccess; + // Found my slot + } else if (strcmp(states[i].name, name) == 0) { + *state = states + i; + return ncclSuccess; + } + } + WARN("NET/TOPO : Couldn't find net with name %s", name); + return ncclInternalError; +} + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) { ncclResult_t ret = ncclSuccess; struct ncclXml* xml; char* mem = NULL; int* localRanks = NULL; - int netDevCount = 0; struct ncclXml* rankXml; int localRank = -1, nLocalRanks = 0; + int netLockHeld = 0; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { @@ -761,47 +1278,24 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail); NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail); } + // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. + pthread_mutex_lock(&netLock); + netLockHeld = 1; + INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); + ncclTopoNetState* state; + state = NULL; if (collNetSupport(comm)) { - NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail); - for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail); - } - } - if (netDevCount == 0) { - NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail); - } - for (int n=0; nncclNet->getProperties(n, &props), ret, fail); - comm->netDeviceType = props.netDeviceType; - struct ncclXmlNode* netNode; - NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail); - NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail); - NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail); - NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail); - NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); + NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state, + comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail); } + NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state, + comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail); + pthread_mutex_unlock(&netLock); + netLockHeld = 0; // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail); @@ -845,19 +1339,21 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail); } - xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); - if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { - INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); - NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail); + if (dumpXmlFile && comm->rank == ncclParamTopoDumpFileRank()) { + INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", dumpXmlFile); + NCCLCHECKGOTO(ncclTopoDumpXmlToFile(dumpXmlFile, xml), ret, fail); } - NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + // Only update our topo tracking structure if we aren't dumping (separate steps) + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + exit: if (!comm->MNNVL && localRanks) free(localRanks); if (mem) free(mem); free(xml); return ret; fail: + if (netLockHeld) pthread_mutex_unlock(&netLock); goto exit; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 0837fb4b3..8e7cda5b4 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -78,6 +78,9 @@ extern const char* topoLinkTypeStr[]; // Connection through the network #define PATH_NET 8 +// New type of path which should precede PATH_PIX 
+#define PATH_PORT PATH_NVL + // Disconnected #define PATH_DIS 9 extern const char* topoPathTypeStr[]; @@ -106,6 +109,7 @@ struct ncclTopoLinkList { #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56) #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK) +#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)numaid << 56) + busid) #define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK)) struct ncclTopoNode { diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f0a622452..f5f2e1185 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -31,23 +31,87 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { return nt; } -ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { - int def, set; - if (str[0] == '^') { - def = 1; set = 0; str++; - } else { - def = 0; set = 1; - } - for (int i=0; i= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type - int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0; + int index1 = nNodes == 1 ? compCapIndex : + (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0; double llMaxBw = llMaxBws[index1][index2]; double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring - if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; @@ -190,7 +253,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; - if (a == NCCL_ALGO_PAT) busBw *= .85; + if (a == NCCL_ALGO_PAT) busBw *= .75; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { @@ -226,10 +289,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom busBw *= ratio; } comm->bandwidths[coll][a][p] = busBw; - /* Ring bandwidth backup */ - if (a == NCCL_ALGO_RING) - comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p]; - comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; // With ppn=1 latencies are fully exposed, use the Tree network latency @@ -286,41 +345,78 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. - int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 }; + int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS]; + int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS]; + for (int f=0; fnNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0; - - // Disable CollNet if it is not supported - if (comm->collNetSupport == 0) { - algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; - algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; - if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0; - // If user has hard set NCCL_ALGO=COLLNET, ignore it - if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && - algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) { - algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; + if (comm->rank == 0 && (algoStr||protoStr)) { + constexpr int strLength = 1024; + char funcAlgoProtoTuningStr[strLength]; + int offset = 0; + offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n Function | "); + for (int p=0; ptopo, &nvsCount)); + + for (int f=0; fnNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1; + // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported. 
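+ // (NVLS is only disabled here for multi-node communicators; single-node NVLS does not depend on CollNet support.)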
+ if (comm->collNetSupport == 0 && + (a == NCCL_ALGO_COLLNET_DIRECT || + a == NCCL_ALGO_COLLNET_CHAIN || + (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1; + // Disable CollNet+Direct if not on an NVSwitch system + if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1; + if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0; } - } else { - // Disable CollNet+Direct if not on an NVSwitch system - int nvsCount = 0; - NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); - if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; cbandwidths[c][a][p] = 0; - if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; - if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0; - } - - for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { - bool available = false; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) - if (comm->bandwidths[c][a][p] != 0) { - available = true; - goto check_avail; - } - check_avail: - if (available == false) { - /* at least set ring algo available */ - for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) - comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p]; - } + if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0; } if (comm->rank == 0) { - char line[1024]; + constexpr int lineLen = 1024; + char line[lineLen]; + int offset = 0; for (int block=0; block= NCCL_NUM_ALGORITHMS) continue; - sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], ""); + offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s %14s %14s |", "", ncclAlgoStr[a], ""); } INFO(NCCL_TUNING, "%s", line); - sprintf(line, " Protocol |"); + offset = snprintf(line, lineLen, " Protocol |"); for (int ba=0; ba<3; ba++) { for (int p=0; p= NCCL_NUM_ALGORITHMS) continue; for (int p=0; pmaxThreads[a][p]); + offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]); } } INFO(NCCL_TUNING, "%s", line); for (int c=0; c= NCCL_NUM_ALGORITHMS) continue; for (int p=0; platencies[c][a][p], comm->bandwidths[c][a][p]); + offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]); } } INFO(NCCL_TUNING, "%s", line); } } } - + // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; @@ -438,19 +519,10 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; -ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) { +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) { float bw = comm->bandwidths[coll][algorithm][protocol]; float lat = comm->latencies[coll][algorithm][protocol]; - if (backup) { - *backup = false; - if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { - /* try back up RING algorithm */ - bw = comm->ringbdw[coll][protocol]; - *backup = true; - } - } - if (bw == 0) { *time = -1.0; return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index bb123b798..a41289389 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -17,6 +17,9 @@ #include #endif +// Arbitrarily large number for constructing virtual topology string +#define NCCL_MAX_XML_DEPTH 1024 + /*******************/ /* XML File 
Parser */ /*******************/ @@ -430,7 +433,7 @@ static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) { ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { char filePath[PATH_MAX]; - sprintf(filePath, "%s/%s", path, fileName); + snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName); int offset = 0; FILE* file; if ((file = fopen(filePath, "r")) != NULL) { @@ -883,7 +886,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl // where sysPath/subsystem points to. ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { char subSysPath[PATH_MAX]; - sprintf(subSysPath, "%s/subsystem", sysPath); + snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath); char* path = realpath(subSysPath, NULL); if (path == NULL) { subSys[0] = '\0'; @@ -896,8 +899,9 @@ ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { return ncclSuccess; } -ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent) { NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); + if (*netNode != NULL) return ncclSuccess; const char* pciSysPath = pciPath; @@ -906,13 +910,15 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); // This is not a PCI device (virtual, usb, ...). if (strcmp(subSystem, "pci") != 0) { - INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); + INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); pciSysPath = NULL; } } struct ncclXmlNode* parent = NULL; - if (pciSysPath) { + if (forceParent) { + parent = forceParent; + } else if (pciSysPath) { int offset; for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; diff --git a/src/graph/xml.h b/src/graph/xml.h index 0ee56790b..f06c0e68b 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -50,7 +50,7 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm /* Auto-detect functions */ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode); -ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode); +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent=NULL); /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); @@ -132,6 +132,13 @@ static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrNa return ncclSuccess; } +static ncclResult_t xmlGetAttrFloatDefault(struct ncclXmlNode* node, const char* attrName, float* value, float defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? 
strtof(str, NULL) : defaultValue; + return ncclSuccess; +} + static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) { *node = NULL; for (int i=0; i<xml->maxIndex; i++) { @@ -208,6 +215,24 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c return ncclSuccess; } +static ncclResult_t xmlPrintNodeRecursive(struct ncclXmlNode* node, const char* name) { + while (node) { + char line[1024*8]; + int cursor = 0; + snprintf(line, sizeof(line), "<%s", node->name); + for (int i = 0; i < node->nAttrs; i++) { + cursor = strlen(line); + snprintf(line + cursor, sizeof(line) - cursor, " %s=%s", node->attrs[i].key, node->attrs[i].value); + } + cursor = strlen(line); + snprintf(line + cursor, sizeof(line) - cursor, ">"); + INFO(NCCL_GRAPH, "%s", line); + node = node->parent; + } + return ncclSuccess; +} + + static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/group.cc b/src/group.cc index 3d3ecb88c..e387db70c 100644 --- a/src/group.cc +++ b/src/group.cc @@ -323,7 +323,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g /* reset everything */ while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); - if (job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); @@ -392,7 +392,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuegroupCommHeadPtr; @@ -401,8 +400,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf bool *groupAbortFlag = gjob->abortFlagPtr; - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - if (!simInfo && groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; do { @@ -454,12 +451,19 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf } comm = comm->groupNext; } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); while (!ncclIntruQueueEmpty(&asyncCollJobs)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); if (job->destructor) job->destructor((void*)job); } + + // done with all buffer allocation, start registration and enqueue + comm = groupCommHeadMain; + do { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail); + comm = comm->groupNext; + } while (comm); } if ((!simInfo) && (groupCommHeadMain != nullptr)) { @@ -476,6 +480,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; + // Poll for callbacks sent to us from other threads. 
Typically these free + // resources from to our memory pools and UB + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); (void) ncclGroupCommLeave(comm); if (!comm->config.blocking) { (void) ncclCommSetAsyncError(comm, ret); @@ -483,8 +490,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf groupCommHeadMain = next; } - CUDACHECK(cudaSetDevice(savedDev)); - exit: return ret; fail: @@ -563,7 +568,10 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ret = ncclInProgress; } else { /* blocking group */ + int savedDev; + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail); if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); groupResetJobState(ncclGroupJobMainPtr); } diff --git a/src/include/collectives.h b/src/include/collectives.h index e45d78f26..c82ebce6f 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "device.h" +#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -23,6 +24,7 @@ #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above +#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. const char* ncclFuncToString(ncclFunc_t op); const char* ncclDevRedOpToString(ncclDevRedOp_t op); @@ -34,11 +36,11 @@ inline int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: case ncclUint8: + case ncclFloat8e4m3: + case ncclFloat8e5m2: return 1; case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - #endif return 2; case ncclInt32: case ncclUint32: @@ -67,6 +69,319 @@ struct ncclConnFifo { #include +class RingAlgorithm { +protected: + int refCount; + int nRanks; + int nStepsPerLoop; + int chunkSteps; + int sliceSteps; + ssize_t sliceSize; + ssize_t loopSize; + ssize_t channelSize; + uint8_t *sendbuff; + uint8_t *recvbuff; + void *sendMhandle; + void *recvMhandle; + void *srecvMhandle; +public: + // this ring class is used by proxy thread to retrieve the send and recv buffer, size as well as corresponding + // mem handle based on the current step of the proxy args. The derived ring algo class is AR, AG, and BC which + // would be allocated during enqueue stage and copied to proxy side through shared memory. For each copy, we will + // increase the refCount by incRefCount() since the same ring algo object can be referenced multiple times for send + // and recv progress. After all steps are done, we decrease the refCount and only delete the ring object when + // refCount == 0. 
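+ // Given the current proxy step, getNextSendAddr/getNextRecvAddr return the address to send from or receive
+ // into, the number of bytes to transfer for that step, and the memory handle to hand to the network plugin.
+ // Sizes are clamped to zero past the end of the operation so the proxy can post the trailing partial slices safely.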
+ virtual void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) = 0; + virtual void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) = 0; + int incRefCount() { + return __atomic_add_fetch(&refCount, 1, __ATOMIC_RELAXED); + } + int decRefCount() { + return __atomic_sub_fetch(&refCount, 1, __ATOMIC_RELEASE); + } + RingAlgorithm() { refCount = 0; } + virtual ~RingAlgorithm() {}; +}; + +class RingARAlgorithm : public RingAlgorithm { +private: + int ringIndex; + int elemSize; + ssize_t chunkSize; + int slicePerChunk; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int curLoopStage = (curStep % nStepsPerLoop) / chunkSteps; + int chunkStage = curLoopStage % nRanks; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t elemOffset = curLoop * loopSize; + ssize_t remSize = channelSize - elemOffset; + ssize_t chunkOffset; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t curChunkSize; + ssize_t size; + ssize_t nelem; + int chunkId; + + if (remSize < loopSize) { + curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize; + } else { + curChunkSize = chunkSize; + } + chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks; + chunkOffset = chunkId * curChunkSize; + nelem = std::min(remSize - chunkOffset, curChunkSize); + curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + + if (nelem <= sliceOffset) { + *sendbuffOut = sendbuff; + *mhandleOut = sendMhandle; + } else { + if (curLoopStage == 0) { + *sendbuffOut = sendbuff + elemOffset + chunkOffset + sliceOffset; + *mhandleOut = sendMhandle; + } else { + *sendbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset; + *mhandleOut = srecvMhandle; + } + } + size = std::min(curSliceSize, nelem - sliceOffset); + *sizeOut = size < 0 ? 0 : size; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int curLoopStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps; + int chunkStage = curLoopStage % nRanks; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t elemOffset = curLoop * loopSize; + ssize_t remSize = channelSize - elemOffset; + ssize_t chunkOffset; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t curChunkSize; + ssize_t size; + ssize_t nelem; + int chunkId; + + if (remSize < loopSize) { + curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize; + } else { + curChunkSize = chunkSize; + } + + if (curLoopStage == 0) { + chunkId = (ringIndex + 1) % nRanks; + } else { + chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks; + } + + chunkOffset = chunkId * curChunkSize; + nelem = std::min(remSize - chunkOffset, curChunkSize); + curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (nelem <= sliceOffset) { + *recvbuffOut = recvbuff; + } else { + *recvbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset; + } + if (sizeOut) { + size = std::min(curSliceSize, nelem - sliceOffset); + *sizeOut = size < 0 ? 
0 : size; + } + *mhandleOut = recvMhandle; + return; + } + + RingARAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int ringIndex, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->ringIndex = ringIndex; + this->nRanks = nRanks; + this->nStepsPerLoop = 2 * (nRanks - 1) * chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->chunkSize = chunkSize; + this->sliceSize = sliceSize; + this->loopSize = nRanks * chunkSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->channelSize = channelSize; + this->elemSize = elemSize; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + this->slicePerChunk = chunkSteps / sliceSteps; + } + ~RingARAlgorithm() {} +}; + +class RingAGAlgorithm : public RingAlgorithm { +private: + int *ringRanks; + int elemSize; + ssize_t sendSize; + int slicePerChunk; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int chunkStage = (curStep % nStepsPerLoop) / chunkSteps; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset); + ssize_t size; + int rankDest; + uint8_t *buff; + void *mhandle; + + curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (chunkStage == 0) { + rankDest = ringRanks[0]; + offset = elemOffset + sliceOffset; + buff = sendbuff + offset; + mhandle = sendMhandle; + } else { + rankDest = ringRanks[nRanks - chunkStage]; + offset = elemOffset + rankDest * sendSize + sliceOffset; + buff = recvbuff + offset; + mhandle = srecvMhandle; + } + *sendbuffOut = buff; + size = std::min(curSliceSize, channelSize - elemOffset - sliceOffset); + *sizeOut = size < 0 ? 0 : size; + *mhandleOut = mhandle; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int chunkStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset); + ssize_t size; + int rankDest; + + curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (chunkStage == 0) { + rankDest = ringRanks[1]; + } else { + rankDest = ringRanks[nRanks - chunkStage]; + } + offset = elemOffset + rankDest * sendSize + sliceOffset; + *recvbuffOut = recvbuff + offset; + if (sizeOut) { + size = std::min(sliceSize, channelSize - elemOffset - sliceOffset); + *sizeOut = size < 0 ? 
0 : size; + } + *mhandleOut = recvMhandle; + } + + RingAGAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, size_t sendSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->ringRanks = ringRanks; + this->nRanks = nRanks; + this->nStepsPerLoop = (nRanks - 1) * chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->elemSize = elemSize; + this->sliceSize = sliceSize; + this->loopSize = chunkSize; + this->sendSize = sendSize; + this->channelSize = channelSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + this->slicePerChunk = chunkSteps / sliceSteps; + } + ~RingAGAlgorithm() {} +}; + +class RingBCAlgorithm : public RingAlgorithm { +private: + int root; + int rank; + int nextRank; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset = sliceStage * sliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t size; + uint8_t *buff; + void *mhandle; + + offset = elemOffset + sliceOffset; + if (offset >= channelSize) { + buff = sendbuff; + mhandle = sendMhandle; + } else if (rank == root) { + buff = sendbuff + offset; + mhandle = sendMhandle; + } else { + buff = recvbuff + offset; + mhandle = srecvMhandle; + } + *sendbuffOut = buff; + size = std::min(sliceSize, channelSize - offset); + *sizeOut = size < 0 ? 0 : size; + *mhandleOut = mhandle; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset = sliceStage * sliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t size; + offset = elemOffset + sliceOffset; + if (offset >= channelSize) { + *recvbuffOut = recvbuff; + } else { + *recvbuffOut = recvbuff + offset; + } + if (sizeOut) { + size = std::min(sliceSize, channelSize - offset); + *sizeOut = size < 0 ? 0 : size; + } + *mhandleOut = recvMhandle; + return; + } + + RingBCAlgorithm(const void* sendbuff, void* recvbuff, int rank, int root, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->root = root; + this->rank = rank; + this->nextRank = ringRanks[1]; + this->nStepsPerLoop = chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->sliceSize = sliceSize; + this->loopSize = chunkSize; + this->channelSize = channelSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + } + ~RingBCAlgorithm() {} +}; + template class PatRSAlgorithm{ size_t offset; @@ -532,10 +847,10 @@ class PatAGAlgorithm{ int sendDataRank = (rank + nranks + s) % nranks; outIx = sendDataRank * count + offset; recvDim = s ? 
firstBitSet(s, nrPow2) : -1; - s -= (1<> (recvDim+1); recvOffset = (foffset%postFreq)*nelem; recvStepOffset = foffset / postFreq; diff --git a/src/include/comm.h b/src/include/comm.h index 9d102dfed..c3f4eb49f 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -197,12 +197,15 @@ struct ncclTaskColl { int32_t algorithm:8, protocol:8; uint32_t isCollnet:1, isNvls:1; uint32_t devFuncId:30; - enum ncclRegBufferType regBufType; + int regBufType; // number of elements in planner->ipcMemQueue associated with this collective int nCleanupQueueElts; void* sendMhandle; void* recvMhandle; + void** sendNetHandles; + void** recvNetHandles; + void** srecvNetHandles; // index for IPC record lookup uintptr_t sendbuffOffset; uintptr_t recvbuffOffset; @@ -236,6 +239,7 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + bool isHostCbEnq; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void *kernelFn; @@ -365,6 +369,7 @@ struct ncclKernelPlanner { struct ncclIntruQueue collTaskQueue; struct ncclIntruQueue collWorkQueue; + struct ncclIntruQueue tmpCollWorkQueue; struct ncclIntruQueue collCleanupQueue; ////////////////////////////////////////////////////////////////////////////// @@ -463,6 +468,8 @@ struct ncclComm { // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; + // Collective operation counter + uint64_t collOpCount; // Channels for collectives int nChannels; // connection nChannels @@ -486,7 +493,6 @@ struct ncclComm { ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; /* This attribute can indicate the states of communicators and return code of @@ -532,7 +538,7 @@ struct ncclComm { int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet int collNetSupport; - bool collNetRegSupport; + bool isOneRPN; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; bool intraNodeP2pSupport; int* collNetHeads; @@ -560,6 +566,7 @@ struct ncclComm { // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -599,9 +606,16 @@ struct ncclComm { // buffer registration cache struct ncclRegCache regCache; + int isAllNvlink; + bool useNetPXN; + bool useGdr; + int splitCount; uint64_t endMagic; }; +static_assert(offsetof(struct ncclComm, startMagic) == 0, "startMagic must be the first field of ncclComm"); +static_assert(offsetof(struct ncclComm, endMagic) == sizeof(struct ncclComm) - sizeof(uint64_t), "endMagic must be the last field of ncclComm"); + enum ncclLaunchMode { ncclLaunchModeInvalid=0, ncclLaunchModeParallel, @@ -644,7 +658,7 @@ inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { } } finish: - cudaThreadExchangeStreamCaptureMode(&mode); + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return ncclSuccess; } diff --git a/src/include/debug.h b/src/include/debug.h index 491ac3e12..4e50cbf5a 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -38,4 +38,6 @@ extern char ncclLastError[]; void ncclSetThreadName(pthread_t thread, const char *fmt, ...); +void ncclResetDebugInit(); + #endif diff --git a/src/include/device.h b/src/include/device.h index 153b5ae36..0c861f595 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -88,24 +88,18 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) -#define NCCL_DIRECT_WRITE 0x01 -#define NCCL_DIRECT_READ 0x02 +#define NCCL_P2P_WRITE 0x01 +#define NCCL_P2P_READ 0x02 #define NCCL_DIRECT_NIC 0x04 -#define NCCL_IPC_WRITE 0x08 -#define NCCL_IPC_READ 0x10 -#define NCCL_NVLS_MIN_POLL 0x20 +#define NCCL_NVLS_MIN_POLL 0x80 // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 -#define NCCL_MAX_COLLNET_SIZE (1L << 29) - -enum ncclRegBufferType { - NCCL_REGULAR_BUFFER = 0, - NCCL_IPC_REG_BUFFER = 1, - NCCL_NVLS_REG_BUFFER = 2, - NCCL_COLLNET_REG_BUFFER = 3 -}; +#define NCCL_REGULAR_BUFFER 0x00 +#define NCCL_IPC_REG_BUFFER 0x01 +#define NCCL_NVLS_REG_BUFFER 0x02 +#define NCCL_NET_REG_BUFFER 0x04 struct ncclConnInfo { // Regular comm mechanism @@ -143,8 +137,6 @@ struct ncclConnector { struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; - int sendMemSameProcess; - int recvMemSameProcess; }; struct ncclRing { @@ -228,7 +220,7 @@ struct alignas(16) ncclDevWorkP2p { uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8; uint8_t sendProtoLL:1, recvProtoLL:1; - uint8_t sendRegistered:1, recvRegistered:1; + uint8_t sendNetReg:1, recvNetReg:1; uint8_t sendIpcReg:1, recvIpcReg:1; }; @@ -267,7 +259,7 @@ struct alignas(16) ncclDevWorkColl { // nChannels == (channelHi - channelLo) + 1 uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; - uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4; + uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1; uint32_t root; void* recvbuff; void* sendbuff; @@ -393,7 +385,7 @@ struct ncclDevComm { int nNodes; int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; - int isNvlink; + int isAllNvlink; // Work fifo return credits uint32_t* workConsumed/*[MAXCHANNELS]*/; @@ -525,9 +517,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) { case 
ncclInt64: case ncclUint64: case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - #endif return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; case ncclFloat: case ncclDouble: diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 1bb5a604f..3eb6c0743 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -25,5 +25,16 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); +ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm); + +static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncReduceScatter ? nRanks*count : count; +} +static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather ? nRanks*count : count; +} +static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; +} #endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index b6d86b398..602cc8cd9 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -19,7 +19,7 @@ ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); struct ncclTopoSystem; // Build the topology -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile=NULL); ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); @@ -33,10 +33,11 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr); -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); +ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); @@ -118,6 +119,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int 
minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr); +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time); #endif diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index c3709584c..3a4c42bb2 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -12,6 +12,8 @@ #ifndef NCCL_IBVWRAP_H_ #define NCCL_IBVWRAP_H_ +#include +#include #ifdef NCCL_BUILD_RDMA_CORE #include #else @@ -89,4 +91,14 @@ static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); +// converts a GID into a readable string. On success, returns a non-null pointer to gidStr. +// NULL is returned if there was an error, with errno set to indicate the error. +// errno = ENOSPC if the converted string would exceed strLen. +static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) { + // GID is a 16B handle, to convert it to a readable form, we use inet_ntop + // sizeof(ibv_gid) == sizeof(struct in6_addr), so using AF_INET6 + static_assert(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr"); + return inet_ntop(AF_INET6, gid->raw, gidStr, strLen); +} + #endif //End include guard diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 26851b17e..fcf2251fe 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -32,6 +32,7 @@ typedef enum { NCCL_BOOTSTRAP = 0x1000, NCCL_REG = 0x2000, NCCL_PROFILE = 0x4000, + NCCL_RAS = 0x8000, NCCL_ALL = ~0 } ncclDebugLogSubSys; diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 467d9fdb8..f165aa1bf 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -13,6 +13,9 @@ #include #define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -21,6 +24,18 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +// Max number of ncclNet objects which can live in the same process +#define NCCL_NET_MAX_PLUGINS 3 + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; +typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; + typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. @@ -28,6 +43,7 @@ typedef struct { // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency @@ -35,9 +51,149 @@ typedef struct { int maxRecvs; // Maximum number of grouped receives. 
ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; +typedef ncclNetProperties_v9_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -typedef ncclNetProperties_v8_t ncclNetProperties_t; + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_v9_t; + +typedef ncclNet_v9_t ncclNet_t; + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v9_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclCollNet_v9_t; + +typedef ncclCollNet_v9_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; typedef struct { // Name of the network (mainly for logs) @@ -94,10 +250,6 @@ typedef struct { ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v8_t; -typedef ncclNet_v8_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8 - typedef struct { void* mhandle; void* address; @@ -151,10 +303,6 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v8_t; -typedef ncclCollNet_v8_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8 - typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. 
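As a rough, hypothetical illustration of how the new v9 makeVDevice entry point is meant to be driven (the fuseTwoPorts helper, the device indices and the error handling below are illustrative assumptions, not part of this change):

static ncclResult_t fuseTwoPorts(ncclNet_v9_t* net) {
  // Ask the plugin to expose physical devices 0 and 1 as a single virtual device.
  ncclNetVDeviceProps_t vProps;
  vProps.ndevs = 2;
  vProps.devs[0] = 0;
  vProps.devs[1] = 1;
  int vDev = -1;
  NCCLCHECK(net->makeVDevice(&vDev, &vProps));
  // The fused device is then used like any other device index; its properties
  // report the member devices back through props.vProps.
  ncclNetProperties_v9_t props;
  NCCLCHECK(net->getProperties(vDev, &props));
  INFO(NCCL_NET, "Fused %d devices into virtual NIC %d (%s)", props.vProps.ndevs, vDev, props.name);
  return ncclSuccess;
}

In the topology code earlier in this patch, ncclTopoMakeVnic/ncclTopoForceMerge/ncclTopoAutoMerge play this role, with NCCL_NET_MERGE_LEVEL and NCCL_NET_FORCE_MERGE selecting which physical devices get grouped.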
diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h index 556a0f6e4..a8164d075 100644 --- a/src/include/nccl_profiler.h +++ b/src/include/nccl_profiler.h @@ -16,7 +16,6 @@ enum { ncclProfileProxyOp = (1 << 3), // proxy operation event type ncclProfileProxyStep = (1 << 4), // proxy step event type ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileNumEvents = ( 6), }; typedef struct { @@ -28,28 +27,25 @@ typedef struct { const char* name; uint64_t commHash; uint64_t seqNumber; - uint8_t func; + const char* func; void const* sendBuff; void* recvBuff; size_t count; int root; - uint8_t datatype; - uint32_t op; + const char* datatype; size_t trafficBytes; uint8_t nMaxChannels; uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; + const char* algo; + const char* proto; } coll; struct { const char* name; uint64_t commHash; - uint8_t func; + const char* func; void* buff; - uint8_t datatype; + const char* datatype; size_t count; int peer; } p2p; @@ -67,7 +63,7 @@ typedef struct { int step; } proxyStep; }; -} ncclProfilerEventDescr_v1_t; +} ncclProfilerEventDescr_v2_t; typedef enum { ncclProfilerProxyOpSendPosted, @@ -93,7 +89,7 @@ typedef enum { ncclProfilerProxyCtrlWakeup, ncclProfilerProxyCtrlAppend, ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v1_t; +} ncclProfilerEventState_v2_t; typedef union { struct { @@ -104,7 +100,101 @@ typedef union { struct { int appendedProxyOps; } proxyCtrl; -} ncclProfilerEventStateArgs_v1_t; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v2_t ncclProfiler_t; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; @@ -142,9 +232,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v1_t; -typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v1_t ncclProfiler_t; - #endif diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h index 5cd02149f..6e61118b9 100644 --- a/src/include/nccl_tuner.h +++ b/src/include/nccl_tuner.h @@ -33,6 +33,7 @@ typedef struct { // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer // // Outputs: // - nChannels: number of channels (hence SMs) to be used. @@ -48,16 +49,60 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); + int regBuff, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; -typedef ncclTuner_v3_t ncclTuner_t; + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
+ // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v3_t; // API to be implemented by external tuner typedef struct { diff --git a/src/include/net_device.h b/src/include/net_device.h index 7bb2968c0..5fae9b542 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -25,6 +25,7 @@ typedef struct { } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; -typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 7dee7d4ae..72fbf9ce2 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -302,7 +302,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa struct ncclNvmlCCStatus { bool CCEnabled; - bool multiGpuCCEnabled; + bool multiGpuProtectedPCIE; }; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. 
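For reference, a minimal sketch, not part of the patch, of a getCollInfo() matching the ncclTuner_v4_t signature introduced a little earlier (note the added regBuff argument). It assumes collCostTable is indexed as collCostTable[algo][proto], consistent with the "algo|proto|time entries" wording, and treats entries equal to -1.0 (NCCL_ALGO_PROTO_IGNORE) as unavailable. It leaves every output unset, which the interface explicitly allows, so NCCL falls back to its own defaults.

static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto,
                                       int regBuff, int* nChannels) {
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)regBuff;
  float best = -1.0f;
  int bestAlgo = -1, bestProto = -1;
  for (int a = 0; a < numAlgo; a++) {
    for (int p = 0; p < numProto; p++) {
      float t = collCostTable[a][p];
      if (t < 0.0f) continue;  // ignored entry (NCCL_ALGO_PROTO_IGNORE)
      if (best < 0.0f || t < best) { best = t; bestAlgo = a; bestProto = p; }
    }
  }
  // A real plugin would typically adjust the InOut cost table (or set
  // *nChannels) to steer NCCL toward its preferred entry; leaving everything
  // untouched keeps NCCL's default tuning.
  (void)bestAlgo; (void)bestProto; (void)nChannels;
  return ncclSuccess;
}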
diff --git a/src/include/profiler.h b/src/include/profiler.h index 36774dc84..2b7efe0f6 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -36,9 +36,9 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* ar ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); // Proxy Step Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); -ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); -ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); +ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); +ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); +ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); // Proxy Control Start/Stop Events Wrappers ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); @@ -46,7 +46,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); -ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); // Profiler utility functions diff --git a/src/include/proxy.h b/src/include/proxy.h index a1c44d6b1..b6ef0fa9d 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -15,6 +15,7 @@ #include #include "shmutils.h" #include "p2p.h" +#include "collectives.h" typedef enum : uint8_t { ncclPatternRing, @@ -56,7 +57,11 @@ struct ncclProxyOp { int root; int next; int nsteps; - int chunkSize; + size_t chunkSize; + size_t sliceSize; + size_t loopSize; + size_t loopOffset; + size_t channelSize; uint8_t sliceSteps; uint8_t chunkSteps; uint8_t channelId; @@ -65,13 +70,15 @@ struct ncclProxyOp { uint8_t /*ncclFunc_t*/ coll; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; + uint8_t algorithm; uint8_t reg; - // collnet buffer reg handles + // collnet/p2p/coll buffer reg handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; - + int isOneRPN; + RingAlgorithm *ringAlgo; union ncclProxyOpSpecifics specifics; // Profiler plugin @@ -93,19 +100,21 @@ struct ncclProxyOp { struct ncclProxySubArgs { struct ncclProxyConnection* connection; int reg; - // p2p mhandle - void* mhandle; // collnet handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; size_t offset; + ssize_t loopSize; + ssize_t loopOffset; int channelId; int nsteps; ssize_t nbytes; + ssize_t chunkSize; int peer; - + int isOneRPN; + RingAlgorithm *ringAlgo; int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; @@ -114,11 +123,14 @@ struct ncclProxySubArgs { uint64_t transmitted; uint64_t done; uint64_t end; + int regBufferReady; void* requests[NCCL_STEPS]; // Profiler plugin int eActivationMask; int rank; + pid_t pid; + void* profilerContext; 
void* taskEventHandle; void* opEventHandle; void* stepEventHandles[NCCL_STEPS]; @@ -133,10 +145,11 @@ struct ncclProxyArgs { proxyProgressFunc_t progress; int nsubs; int done; + int onePPN; uint64_t opCount; int sliceSteps; int chunkSteps; - int chunkSize; + size_t chunkSize; size_t totalSendSize; size_t totalRecvSize; size_t sendSizePerRound; @@ -146,16 +159,13 @@ struct ncclProxyArgs { uint8_t /*ncclPattern_t*/ pattern; uint8_t /*ncclFunc_t*/ coll; uint8_t protocol; + uint8_t algorithm; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; int idle; - // Profiler plugin - pid_t pid; - void* profilerContext; - // Element linking struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; diff --git a/src/include/ras.h b/src/include/ras.h new file mode 100644 index 000000000..7909b3dc8 --- /dev/null +++ b/src/include/ras.h @@ -0,0 +1,24 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_RAS_H_ +#define NCCL_RAS_H_ + +#include "socket.h" + +// Structure used to communicate data about NCCL ranks from NCCL threads to RAS. +struct rasRankInit { + union ncclSocketAddress addr; + pid_t pid; + int cudaDev; + int nvmlDev; +}; + +ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); +ncclResult_t ncclRasCommFini(const struct ncclComm* comm); +ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks); + +#endif // !NCCL_RAS_H_ diff --git a/src/include/register.h b/src/include/register.h index 7c60535d9..740a645f4 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -6,6 +6,9 @@ #include #include +int64_t ncclParamLocalRegister(); +int64_t ncclParamGraphRegister(); + enum { NET_REG_COMPLETE = 0x01, NVLS_REG_COMPLETE = 0x02, @@ -20,16 +23,21 @@ struct ncclPeerRegIpcAddr { uintptr_t* hostPeerRmtAddrs; }; +struct ncclRegNetHandles { + void* handle; + struct ncclProxyConnector* proxyConn; + struct ncclRegNetHandles* next; +}; + struct ncclReg { // common attributes size_t pages; - int refs; + int localRefs; + int graphRefs; uintptr_t addr; uint32_t state; // net reg - int nDevs; - int devs[MAXCHANNELS]; - void** handles; + struct ncclRegNetHandles* netHandleHead; // nvls reg uintptr_t baseAddr; size_t baseSize; @@ -50,11 +58,12 @@ struct ncclRegCache { struct ncclReg **slots; int capacity, population; uintptr_t pageSize; - void* sComms[MAXCHANNELS]; - void* rComms[MAXCHANNELS]; }; ncclResult_t ncclRegCleanup(struct ncclComm* comm); ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); +ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); +ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); #endif diff --git a/src/include/shmutils.h b/src/include/shmutils.h index 43e8afb79..097b4c657 100644 --- a/src/include/shmutils.h +++ b/src/include/shmutils.h @@ -10,7 +10,7 @@ #include "nccl.h" typedef void* ncclShmHandle_t; -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); +ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); ncclResult_t 
ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); diff --git a/src/include/socket.h b/src/include/socket.h index 60a413875..f0a3237ce 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -17,9 +17,6 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // connection retry sleep interval in usec -#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) -#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL @@ -39,9 +36,10 @@ enum ncclSocketState { ncclSocketStateConnectPolling = 5, ncclSocketStateConnected = 6, ncclSocketStateReady = 7, - ncclSocketStateClosed = 8, - ncclSocketStateError = 9, - ncclSocketStateNum = 10 + ncclSocketStateTerminating = 8, + ncclSocketStateClosed = 9, + ncclSocketStateError = 10, + ncclSocketStateNum = 11 }; enum ncclSocketType { @@ -49,14 +47,14 @@ enum ncclSocketType { ncclSocketTypeBootstrap = 1, ncclSocketTypeProxy = 2, ncclSocketTypeNetSocket = 3, - ncclSocketTypeNetIb = 4 + ncclSocketTypeNetIb = 4, + ncclSocketTypeRasNetwork = 5 }; struct ncclSocket { int fd; int acceptFd; - int timedOutRetries; - int refusedRetries; + int errorRetries; union ncclSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag; @@ -64,15 +62,18 @@ struct ncclSocket { int salen; uint64_t magic; enum ncclSocketType type; + int customRetry; + int finalizeCounter; // Used to keep track of initial handshake for async sockets. + char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets. }; -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); +const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); // Initialize a socket -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); +ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0); // Create a listening socket. sock->addr can be pre-filled with IP & port info. 
sock->fd is set after a successful call ncclResult_t ncclSocketListen(struct ncclSocket* sock); ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); @@ -88,11 +89,12 @@ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed = NULL); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); +ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how); ncclResult_t ncclSocketClose(struct ncclSocket* sock); #endif diff --git a/src/include/transport.h b/src/include/transport.h index cbeb613ca..37187f69e 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -28,7 +28,6 @@ extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; extern struct ncclTransport* ncclTransports[]; - // Forward declarations struct ncclRing; struct ncclConnector; @@ -115,16 +114,16 @@ struct ncclTransport { }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); 
ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; @@ -143,4 +142,13 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm); +ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle); +ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle); +ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); + +ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue); +ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue); +ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); +ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); + #endif diff --git a/src/include/utils.h b/src/include/utils.h index 5a1b749a7..383f678c8 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -49,8 +49,7 @@ inline uint64_t clockNano() { return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } -/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else - * return -1 */ +/* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { ncclResult_t ret = ncclSuccess; if (bytes > 0) { diff --git a/src/init.cc b/src/init.cc index 94c2fb10e..5caaaae09 100644 --- a/src/init.cc +++ b/src/init.cc @@ -17,6 +17,7 @@ #include "graph.h" #include "argcheck.h" #include "tuner.h" +#include "ras.h" #include #include #include @@ -182,6 +183,8 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + NCCLCHECK(ncclRasCommFini(comm)); + /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). 
*/ @@ -193,7 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) { } } - CUDACHECK(cudaMemPoolDestroy(comm->memPool)); + if (comm->memPool) CUDACHECK(cudaMemPoolDestroy(comm->memPool)); delete[] comm->userRedOps; @@ -421,11 +424,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclIntruQueueConstruct(&comm->eventCallbackQueue); - // setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator - comm->intraComm0 = comm; - comm->intraRank = 0; - comm->intraRanks = 1; - return ncclSuccess; } @@ -435,6 +433,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; + bool ccEnable; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -448,7 +447,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; - tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo); + tmpCommAndChans.comm.isAllNvlink = comm->isAllNvlink; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } @@ -458,11 +457,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); memset(&ccStatus, 0, sizeof(ccStatus)); - if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) { + ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE); + if (ccEnable) { comm->workFifoBytes = 0; - if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) { - WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)"); - } } else { comm->workFifoBytes = ncclParamWorkFifoBytes(); if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { @@ -473,7 +470,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { } if (comm->rank == 0) { - INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes); + INFO(NCCL_INIT, "CC %s, workFifoBytes %d", ccEnable ? 
"On" : "Off", comm->workFifoBytes); } if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { @@ -608,9 +605,6 @@ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); - int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; @@ -619,7 +613,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { } if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); - else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); + else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); // Make sure P2P chunksize is not larger than coll chunksize. @@ -850,6 +844,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } while(0); timers[TIMER_INIT_TOPO] = clockNano(); + + // Dump XML if requested by user + const char* dumpXmlFile; + dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); + if (dumpXmlFile) { + NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail); + } + // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs @@ -1076,9 +1078,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); comm->collNetSupport = 0; } - // As long as there is more than 1 rank on any node, we need to disable collnet reg - comm->collNetRegSupport = (comm->maxLocalRanks == 1); } + comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail); @@ -1293,7 +1295,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. 
NCCLCHECKGOTO(devCommSetup(comm), ret, fail); timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; - /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1338,6 +1339,7 @@ struct ncclCommInitRankAsyncJob { // for ncclCommSplit struct ncclComm* parent; int color, key; + int splitCount; // name of the function calling char funcName[NCCL_COMMINIT_FUNCNAME_LEN]; }; @@ -1432,13 +1434,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color + // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), + // add unique split counter and the color ncclUniqueId tmpId; memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color); + snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); - INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName, - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail); timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; @@ -1474,8 +1477,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { /* unlink child abort flag. */ __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); - INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName, - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); } else { // the name for the replay tool is ncclCommInitRank for all the variations TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev); @@ -1716,8 +1719,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption. 
*comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); - /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ - comm->initState = ncclInternalError; + /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */ + comm->initState = ncclInProgress; *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); @@ -1749,6 +1752,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId exit: return ncclGroupErrCheck(res); fail: + if (job) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1846,7 +1850,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NCCLCHECKGOTO(ncclGroupEnd(), ret, fail); exit: - cudaSetDevice(oldDev); + (void)cudaSetDevice(oldDev); free(gpuFlags); return ret; fail: @@ -1926,14 +1930,9 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; ncclComm_t comm = job->comm; - int savedDevice; - int commDevice = comm->cudaDev; ncclResult_t ret = ncclSuccess; - CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); - if (savedDevice != commDevice) { - CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); - } + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); @@ -1963,10 +1962,6 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret); } - if (savedDevice != commDevice) { - CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); - } - exit: return ret; fail: @@ -1974,25 +1969,12 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { } static ncclResult_t commCleanup(ncclComm_t comm) { - int savedDevice; - int commDevice = comm->cudaDev; - - CUDACHECK(cudaGetDevice(&savedDevice)); - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(commDevice)); - } - + CUDACHECK(cudaSetDevice(comm->cudaDev)); if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); NCCLCHECK(ncclTunerPluginUnload(comm)); } - NCCLCHECK(commFree(comm)); - - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(savedDevice)); - } - return ncclSuccess; } @@ -2099,6 +2081,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2113,6 +2096,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; @@ -2124,7 +2109,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } - + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to 
quit if (comm->childAbortFlag != nullptr) { __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); @@ -2152,6 +2137,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return ncclSuccess; fail: goto exit; @@ -2218,14 +2205,15 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } - /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ - childComm->initState = ncclInternalError; + /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */ + childComm->initState = ncclInProgress; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; + job->splitCount = ++comm->splitCount; job->color = color; job->key = key; job->cudaDev = comm->cudaDev; @@ -2233,13 +2221,13 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: - cudaSetDevice(oldDev); + (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); return res; fail: if (childComm) { - if (comm && !comm->config.splitShare) { + if (!comm->config.splitShare) { free(childComm->abortFlag); if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); free(childComm->abortFlagRefCount); @@ -2347,14 +2335,12 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - if (CUPFN(cuMulticastCreate) != NULL) - CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { + if (ncclCuMemEnable()) { int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; - (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; @@ -2365,18 +2351,24 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - - /* mc property */ CUDACHECK(cudaGetDeviceCount(&dcnt)); - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); + + if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); + if (mcSupport) { + /* mc property */ + mcprop.size = size; + /* device cnt is a dummy value right now, it might affect mc granularity in the future. 
*/ + mcprop.numDevices = dcnt; + mcprop.handleTypes = requestedHandleTypes; + mcprop.flags = 0; + CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + /* only size needs to be aligned to mcGran */ + ALIGN_SIZE(size, mcGran); + } else { + ALIGN_SIZE(size, memGran); + } + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); @@ -2403,6 +2395,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); } + if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } goto exit; } @@ -2429,18 +2422,13 @@ ncclResult_t ncclMemFree(void *ptr) { CUDACHECK(cudaGetDevice(&saveDevice)); #if CUDART_VERSION >= 12010 CUdevice ptrDev = 0; - int mcSupport = 0; if (ptr == NULL) goto fallback; - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); - if (CUPFN(cuMulticastCreate) != NULL) - CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail); - CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); - if (mcSupport) { + if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); goto exit; } diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 03e3bde99..e5fec1e46 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -11,7 +11,7 @@ // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); -NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0); +NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1); // Handle type used for cuMemCreate() CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; @@ -35,9 +35,6 @@ int ncclIsCuMemSupported() { // Query device to see if CUMEM VMM support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error); if (!flag) return 0; - // Query device to see if CUMEM RDMA support is available - CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error); - if (!flag) return 0; error: return (ret == ncclSuccess); #endif @@ -49,11 +46,31 @@ int ncclCuMemEnable() { return param >= 0 ? param : (param == -2 && ncclCuMemSupported); } +static int ncclCumemHostEnable = -1; int ncclCuMemHostEnable() { + if (ncclCumemHostEnable != -1) + return ncclCumemHostEnable; #if CUDART_VERSION < 12020 - return 0; + ncclCumemHostEnable = 0; + return ncclCumemHostEnable; #else - return ncclParamCuMemHostEnable(); + ncclResult_t ret = ncclSuccess; + int cudaDriverVersion; + int paramValue = -1; + CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); + if (cudaDriverVersion < 12020) { + ncclCumemHostEnable = 0; + } + else { + paramValue = ncclParamCuMemHostEnable(); + if (paramValue != -1) + ncclCumemHostEnable = paramValue; + else + ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 
1 : 0; + } + return ncclCumemHostEnable; +error: + return (ret == ncclSuccess); #endif } @@ -218,10 +235,9 @@ static void initOnceFunc() { // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); -#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030 - /* To use cuMem* for host memory allocation, we need to create context on each - * visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */ - if (ncclCuMemSupported && ncclCuMemHostEnable()) { + /* To use cuMem* for host memory allocation, we need to create context on each visible device. + * This is a workaround needed in CUDA 12.2 and CUDA 12.3 which is fixed in 12.4. */ + if (ncclCuMemSupported && ncclCuMemHostEnable() && 12020 <= driverVersion && driverVersion <= 12030) { int deviceCnt, saveDevice; cudaGetDevice(&saveDevice); cudaGetDeviceCount(&deviceCnt); @@ -231,7 +247,6 @@ static void initOnceFunc() { } cudaSetDevice(saveDevice); } -#endif initResult = ret; return; error: diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index eb4e52b60..698465ca4 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -8,6 +8,7 @@ #include #include +#include "ibvcore.h" #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; @@ -53,7 +54,7 @@ ncclResult_t wrap_ibv_symbols(void) { } \ int ret = container.call; \ if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ - INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + INFO(NCCL_NET, "Call to " name " not supported"); \ *supported = 0; \ return ncclSuccess; \ } else if (ret != success_retval) { \ @@ -87,6 +88,14 @@ ncclResult_t wrap_ibv_symbols(void) { container.call; \ return ncclSuccess; +NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0); +NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34); +NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds + +#define IBV_ERR_EQ(e, code) (e == code || e == (-code)) +#define IBV_MQP_RETRY_ERRNO(e) (IBV_ERR_EQ(e, ETIMEDOUT)) +#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e)) + ncclResult_t wrap_ibv_fork_init() { IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); } @@ -202,8 +211,87 @@ ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct i IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } -ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); +static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) { + switch (state) { + case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break; + case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break; + case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break; + case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break; + case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break; + case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break; + case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break; + case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break; + default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break; + } +} + +#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? 
(userAttr) : (attr)) + +static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) { + ncclResult_t res; + int portNum = -1, gidIndex = -1; + char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN]; + const char *localGidRes = NULL, *remoteGidRes = NULL; + + char nextState[32], currState[32]; + ibvQpStateName(qp->state, currState, sizeof(currState)); + ibvQpStateName(qpState, nextState, sizeof(nextState)); + char devName[IBV_SYSFS_NAME_MAX] = ""; + snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A"); + + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr; + int attr_mask = IBV_QP_PORT | IBV_QP_AV; + res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr); + struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL; + + // port info, portAttr can be NULL if not given by the user and query_qp failed + struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT); + portNum = portAttr ? portAttr->port_num : -1; + + // address info, avAttr can be NULL if not given by the user and query_qp failed + struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV); + if (avAttr && avAttr->ah_attr.is_global) { + union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid; + remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName)); + // we need pd->context to retrieve local GID, skip if not there + if (!qp->pd->context) goto print; + gidIndex = avAttr->ah_attr.grh.sgid_index; + union ibv_gid localGid; + NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print); + localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName)); + } + +print: + snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s", + devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? 
remoteGidName : "N/A"); + return; +} + +ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) { + char qpMsg[1024]; + int ret = 0, attempts = 0; + int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1 + int timeOut = (int)ncclParamIbMQpRetryTimeout(); + CHECK_NOT_NULL(ibvSymbols, ibv_internal_modify_qp); + do { + if (attempts > 0) { + unsigned int sleepTime = timeOut * attempts; + ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); + INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime); + // sleep before retrying + struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)}; + nanosleep(&tv, NULL); + } + ret = ibvSymbols.ibv_internal_modify_qp(qp, attr, attr_mask); + attempts++; + } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt); + if (ret != 0) { + ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); + WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg); + return ncclSystemError; + } + return ncclSuccess; } ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 2d17f47e6..23746b3c5 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -189,14 +189,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + if (sendFd != -1) { + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + } msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index f441af80b..66ba2d4c8 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -311,19 +311,19 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { status->CCEnabled = false; if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE) - status->multiGpuCCEnabled = true; + status->multiGpuProtectedPCIE = true; else - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } else if (pfn_nvmlSystemGetConfComputeState != NULL) { NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) status->CCEnabled = true; else status->CCEnabled = false; - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } else { status->CCEnabled = false; - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } return ncclSuccess; } diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc index 9a4adf579..c9fb2a869 100644 --- a/src/misc/profiler.cc +++ b/src/misc/profiler.cc @@ -16,9 
+16,110 @@ static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER; static int profilerPluginRefCount; static void* profilerPluginLib; static ncclProfiler_t* ncclProfiler; +static ncclProfiler_v2_t ncclProfiler_v1_as_v2; +static ncclProfiler_v1_t* ncclProfiler_v1; + +static uint8_t ncclStringToFunc(const char* func) { + if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather; + if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce; + if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast; + if (0 == strcmp(func, "Recv")) return ncclFuncRecv; + if (0 == strcmp(func, "Reduce")) return ncclFuncReduce; + if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter; + if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv; + return ncclFuncSend; +} + +static uint8_t ncclStringToAlgo(const char* algo) { + if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE; + if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING; + if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT; + if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN; + if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS; + if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE; + return NCCL_ALGO_PAT; +} + +static uint8_t ncclStringToProto(const char* proto) { + if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL; + if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128; + return NCCL_PROTO_SIMPLE; +} + +static uint8_t ncclStringToDatatype(const char* dt) { + if (0 == strcmp(dt, "ncclInt8")) return ncclInt8; + if (0 == strcmp(dt, "ncclInt32")) return ncclInt32; + if (0 == strcmp(dt, "ncclUint32")) return ncclUint32; + if (0 == strcmp(dt, "ncclInt64")) return ncclInt64; + if (0 == strcmp(dt, "ncclUint64")) return ncclUint64; + if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16; + if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32; +#if defined(__CUDA_BF16_TYPES_EXIST__) + if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16; +#endif + return ncclFloat64; +} + +static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) { + ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; + eDescr_v1.type = eDescr->type; + eDescr_v1.parentObj = eDescr->parentObj; + eDescr_v1.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v1.coll.name = eDescr->coll.name; + eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); + eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v1.coll.count = eDescr->coll.count; + eDescr_v1.coll.root = eDescr->coll.root; + eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); + eDescr_v1.coll.op = 0; // removed in v2 + eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes; + eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nWarps = eDescr->coll.nWarps; + eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); + eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); + } break; + case ncclProfileP2p: { + eDescr_v1.p2p.name = eDescr->p2p.name; + eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); + eDescr_v1.p2p.buff = eDescr->p2p.buff; + eDescr_v1.p2p.count = eDescr->p2p.count; + eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); + 
eDescr_v1.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v1.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + default:; + } + return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); +} + +static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { + ncclProfiler_v1->init(context, eActivationMask); + ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; + ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; + ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState; + ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize; + return ncclSuccess; +} #define MAX_STR_LEN 256 -#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1" static void* tryOpenLib(char* name, int *err, char* errStr) { if (nullptr == name || strlen(name) == 0) { @@ -33,7 +134,7 @@ static void* tryOpenLib(char* name, int *err, char* errStr) { if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = 0; - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } } @@ -116,10 +217,21 @@ static ncclResult_t ncclProfilerPluginLoad(void) { goto fail; } - ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL); + ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2"); if (ncclProfiler == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL "."); - goto fail; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2."); + ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1"); + if (ncclProfiler_v1 == nullptr) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); + goto fail; + } else { + ncclProfiler = &ncclProfiler_v1_as_v2; + ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name; + ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1."); + } + } else { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2."); } ++profilerPluginRefCount; @@ -247,7 +359,7 @@ ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); if (__builtin_expect(ncclProfiler != NULL, 0)) { if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { - ncclProfilerEventDescr_v1_t eDescr = { 0 }; + ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileGroup; ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); } @@ -279,20 +391,17 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.coll.name = plan->comm->commName; eDescr.coll.commHash = plan->comm->commHash; eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; - eDescr.coll.func = ct->func; + eDescr.coll.func = ncclFuncToString(ct->func); eDescr.coll.sendBuff = ct->sendbuff; 
eDescr.coll.recvBuff = ct->recvbuff; eDescr.coll.count = ct->count; eDescr.coll.root = ct->root; - eDescr.coll.datatype = ct->datatype; - eDescr.coll.op = ct->opHost; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); eDescr.coll.trafficBytes = ct->trafficBytes; eDescr.coll.nMaxChannels = ct->nMaxChannels; eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ct->algorithm; - eDescr.coll.proto = ct->protocol; - eDescr.coll.isCollnet = ct->isCollnet; - eDescr.coll.isNvls = ct->isNvls; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); // update collective task with group event activation mask @@ -307,10 +416,10 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.rank = plan->comm->rank; eDescr.p2p.name = plan->comm->commName; eDescr.p2p.commHash = plan->comm->commHash; - eDescr.p2p.func = pt->func; + eDescr.p2p.func = ncclFuncToString(pt->func); eDescr.p2p.buff = pt->buff; eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = pt->datatype; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); eDescr.p2p.peer = pt->root; ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); @@ -345,6 +454,11 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { return ncclSuccess; } +// Below we set the proxy descriptor step number to DIVUP(step, args->sliceSteps). +// The reason is that for some ncclOp (e.g. AllReduce) one network transfer is +// made of sliceSteps steps rather than one step. In the profiler we are still +// interested in whole network transfers though, so we account for this when +// computing the actual network step number.
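As a rough illustration of the step accounting described in the comment above (this sketch is not part of the patch; DIVUP is redefined locally so it compiles standalone, and sliceSteps = 4 is just an assumed value): with sliceSteps = 4, proxy steps 1-4 all belong to the first network transfer and steps 5-8 to the second, which is exactly what DIVUP(step, sliceSteps) computes.

#include <stdio.h>
// Same rounding-up integer division used by NCCL's DIVUP macro.
#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int main(void) {
  int sliceSteps = 4;  // assumed value for the example; in NCCL it comes from ncclProxyArgs
  for (int step = 1; step <= 8; step++) {
    // proxy steps 1..4 -> network step 1, proxy steps 5..8 -> network step 2
    printf("proxy step %d -> network step %d\n", step, DIVUP(step, sliceSteps));
  }
  return 0;
}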
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) { TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; @@ -354,13 +468,13 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; eDescr.rank = sub->rank; - eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.pid = sub->pid; eDescr.proxyOp.channelId = sub->channelId; eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = sub->nsteps; - eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); + eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; eDescr.proxyOp.isSend = 1; - ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } TIME_STOP_EVENT(proxyOpStart); @@ -376,13 +490,13 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; eDescr.rank = sub->rank; - eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.pid = sub->pid; eDescr.proxyOp.channelId = sub->channelId; eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = sub->nsteps; - eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); + eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; eDescr.proxyOp.isSend = 0; - ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } TIME_STOP_EVENT(proxyOpStart); @@ -400,53 +514,50 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileProxyStep; - eDescr.parentObj = sub->opEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyStep.step = step; - ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); - } + int step_ = DIVUP(stepId, args->sliceSteps); + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step_; + ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); } } TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } -ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = 
ncclProfileProxyStep; - eDescr.parentObj = sub->opEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyStep.step = step; - ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); - } + int step_ = DIVUP(stepId, args->sliceSteps); + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step_; + ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); } } TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } -ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStop); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - if (sub->stepEventHandles[step%NCCL_STEPS]) { - ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]); - sub->stepEventHandles[step%NCCL_STEPS] = NULL; - } + int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->stepEventHandles[step_%NCCL_STEPS]) { + ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]); + sub->stepEventHandles[step_%NCCL_STEPS] = NULL; } } TIME_STOP_EVENT(proxyStepStop); @@ -484,8 +595,8 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { - ncclProfilerEventStateArgs_t a = { 0 }; - a.proxyOp.steps = steps; + ncclProfilerEventStateArgs_t a = { }; + a.proxyOp.steps = DIVUP(steps, args->sliceSteps); a.proxyOp.transSize = transSize; ncclProfiler->recordEventState(sub->opEventHandle, eState, &a); } @@ -493,14 +604,13 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar return ncclSuccess; } -ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) { +ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyStepRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { - for (uint64_t step = stepLo; step < stepHi; step++) { - if (sub->stepEventHandles[step%NCCL_STEPS]) { - ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0); - } + int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->stepEventHandles[step_%NCCL_STEPS]) { + ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0); } } TIME_STOP_EVENT(proxyStepRecord); @@ -510,7 +620,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyCtrlRecord); if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { - ncclProfilerEventStateArgs_t args = { 0 }; + ncclProfilerEventStateArgs_t args = { }; args.proxyCtrl.appendedProxyOps = appended; ncclProfiler->recordEventState(eHandle, eState, &args); } diff --git a/src/misc/shmutils.cc 
b/src/misc/shmutils.cc index daf3b338d..eb9cd1015 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -45,7 +45,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS return; } -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { +ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { int fd = -1; char* hptr = NULL; void* dptr = NULL; @@ -62,7 +62,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { - sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + snprintf(shmPath, shmPathSize, "/dev/shm/nccl-XXXXXX"); retry_mkstemp: fd = mkstemp(shmPath); if (fd < 0) { @@ -70,7 +70,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno); goto retry_mkstemp; } - WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno); + WARN("Error: failed to create shared memory file %s, error %s (%d)", shmPath, strerror(errno), errno); ret = ncclSystemError; goto fail; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 93e577e05..dfb4e6888 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -12,6 +12,18 @@ #include #include #include "param.h" +#include + +NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34); +NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100); +static void msleep(unsigned int time_msec) { + const long c_1e6 = 1e6; + struct timespec tv = (struct timespec){ + .tv_sec = time_msec / 1000, + .tv_nsec = (time_msec % 1000) * c_1e6, + }; + nanosleep(&tv, NULL); +} static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; @@ -26,8 +38,13 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr return ncclSuccess; } if (bytes == -1) { + if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) { + *closed = 1; + return ncclSuccess; + } if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? 
"recv from" : "send to"), + ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclRemoteError; } else { bytes = 0; @@ -38,17 +55,22 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } - } while (bytes > 0 && (*offset) < size); + } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size); return ncclSuccess; } -static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { +static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) { int closed; NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed)); if (closed) { - char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); - return ncclRemoteError; + if (pclosed) { + *pclosed = closed; + return ncclSuccess; + } else { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); + return ncclRemoteError; + } } return ncclSuccess; } @@ -63,9 +85,9 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s * * Output: "IPv4/IPv6 address" */ -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { +const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; - struct sockaddr *saddr = &addr->sa; + const struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. @@ -370,10 +392,9 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { if (socketToPort(&sock->addr)) { // Port is forced by env. Make sure we get the port. int opt = 1; -#if defined(SO_REUSEPORT) - SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); -#else SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#if defined(SO_REUSEPORT) + SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); #endif } @@ -412,6 +433,15 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen); if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; + } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN || + errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) { + /* per accept's man page, for linux sockets, the following errors might be already pending errors + * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ + if (++sock->errorRetries == ncclParamRetryCnt()) { + WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno)); + return ncclSystemError; + } + INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno)); } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; @@ -419,72 +449,118 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { return ncclSuccess; } -static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { - uint64_t magic; - enum ncclSocketType type; - int received = 0; +static ncclResult_t socketSetFlags(struct ncclSocket* sock) { const int one = 1; + /* Set socket as non-blocking if async or if we need to be able to abort */ + if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { + int flags; + SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); + SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + return ncclSuccess; +} - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); - if (received == 0) return ncclSuccess; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); - if (magic != sock->magic) { - WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); - close(sock->fd); - sock->fd = -1; - // Ignore spurious connection and accept again - sock->state = ncclSocketStateAccepting; - return ncclSuccess; - } else { - received = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); - if (type != sock->type) { - WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); - sock->state = ncclSocketStateError; +static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { + uint64_t magic; + enum ncclSocketType type; + int received; + // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do) + NCCLCHECK(socketSetFlags(sock)); + + if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { + if (sock->asyncFlag == 0) { + received = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); + } else { + received = sock->finalizeCounter; + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received)); + sock->finalizeCounter = received; + if (received < sizeof(magic)) return ncclSuccess; + memcpy(&magic, sock->finalizeBuffer, sizeof(magic)); + } + if (magic != sock->magic) { + WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); close(sock->fd); sock->fd = -1; - return ncclInternalError; - } else { - sock->state = ncclSocketStateReady; + // Ignore spurious connection and accept again + sock->state = ncclSocketStateAccepting; + return ncclSuccess; } } + if (sock->asyncFlag == 0) { + received = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); + } else { + received = sock->finalizeCounter - sizeof(magic); + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received)); + sock->finalizeCounter = received + sizeof(magic); + if (received < sizeof(type)) return ncclSuccess; + memcpy(&type, sock->finalizeBuffer, sizeof(type)); + } + if (type != sock->type) { + WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); + 
sock->state = ncclSocketStateError; + close(sock->fd); + sock->fd = -1; + return ncclInternalError; + } else { + sock->state = ncclSocketStateReady; + } return ncclSuccess; } -static ncclResult_t socketStartConnect(struct ncclSocket* sock) { - /* blocking/non-blocking connect() is determined by asyncFlag. */ - int ret = connect(sock->fd, &sock->addr.sa, sock->salen); - - if (ret == 0) { +static ncclResult_t socketResetFd(struct ncclSocket* sock) { + ncclResult_t ret = ncclSuccess; + int fd = -1; + SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup); + // if sock->fd is valid, close it and reuse its number + if (sock->fd != -1) { + SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup); + SYSCHECKGOTO(close(fd), "close", ret, cleanup); + } else { + sock->fd = fd; + } + NCCLCHECKGOTO(socketSetFlags(sock), ret, exit); +exit: + return ret; +cleanup: + // cleanup fd, leave sock->fd untouched + if (fd != -1) { + (void)close(fd); + } + goto exit; +} +static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { + if (errCode == 0) { sock->state = ncclSocketStateConnected; - return ncclSuccess; - } else if (errno == EINPROGRESS) { + } else if (errCode == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; - return ncclSuccess; - } else if (errno == ECONNREFUSED) { - if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries); - return ncclRemoteError; - } - usleep(SLEEP_INT); - if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - return ncclSuccess; - } else if (errno == ETIMEDOUT) { - if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries); - return ncclRemoteError; + } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { + if (sock->customRetry == 0) { + if (sock->errorRetries++ == ncclParamRetryCnt()) { + sock->state = ncclSocketStateError; + WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries); + return ncclRemoteError; + } + unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); + INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime); + msleep(sleepTime); } - usleep(SLEEP_INT); - return ncclSuccess; + NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */ + sock->state = ncclSocketStateConnecting; } else { char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; - WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); return ncclSystemError; } + return ncclSuccess; +} +static ncclResult_t socketStartConnect(struct ncclSocket* sock) { + /* blocking/non-blocking connect() is determined by asyncFlag. */ + int ret = connect(sock->fd, &sock->addr.sa, sock->salen); + return socketConnectCheck(sock, (ret == -1) ? 
errno : 0, __func__); } static ncclResult_t socketPollConnect(struct ncclSocket* sock) { @@ -509,33 +585,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { /* check socket status */ SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); - - if (ret == 0) { - sock->state = ncclSocketStateConnected; - } else if (ret == ECONNREFUSED) { - if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries); - return ncclRemoteError; - } - if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - sock->state = ncclSocketStateConnecting; - } else if (ret == ETIMEDOUT) { - if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries); - return ncclRemoteError; - } - usleep(SLEEP_INT); - sock->state = ncclSocketStateConnecting; - } else if (ret != EINPROGRESS) { - sock->state = ncclSocketStateError; - char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno)); - return ncclSystemError; - } - return ncclSuccess; + return socketConnectCheck(sock, ret, __func__); } ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { @@ -548,12 +598,24 @@ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { } static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) { - int sent = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); - if (sent == 0) return ncclSuccess; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); - sent = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + int sent; + if (sock->asyncFlag == 0) { + sent = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); + sent = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + } else { + if (sock->finalizeCounter < sizeof(sock->magic)) { + sent = sock->finalizeCounter; + NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); + sock->finalizeCounter = sent; + if (sent < sizeof(sock->magic)) return ncclSuccess; + } + sent = sock->finalizeCounter - sizeof(sock->magic); + NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + sock->finalizeCounter = sent + sizeof(sock->magic); + if (sent < sizeof(sock->type)) return ncclSuccess; + } sock->state = ncclSocketStateReady; return ncclSuccess; } @@ -598,7 +660,6 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif - const int one = 1; if (sock == NULL) { WARN("ncclSocketConnect: pass NULL socket"); @@ -616,9 +677,8 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { } TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); - SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - sock->state = ncclSocketStateConnecting; + sock->finalizeCounter = 0; do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && @@ -664,6 +724,7 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct 
ncclSocket* listen memcpy(sock, listenSock, sizeof(struct ncclSocket)); sock->acceptFd = listenSock->fd; sock->state = ncclSocketStateAccepting; + sock->finalizeCounter = 0; } do { @@ -694,12 +755,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen return ret; } -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { +ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) { ncclResult_t ret = ncclSuccess; if (sock == NULL) goto exit; - sock->timedOutRetries = 0; - sock->refusedRetries = 0; + sock->errorRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = ncclSocketStateInitialized; @@ -707,6 +767,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad sock->type = type; sock->fd = -1; sock->acceptFd = -1; + sock->customRetry = customRetry; if (addr) { /* IPv4/IPv6 support */ @@ -718,28 +779,14 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); ret = ncclInternalError; - goto fail; + goto exit; } sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - - /* Connect to a hostname / port */ - sock->fd = socket(family, SOCK_STREAM, 0); - if (sock->fd == -1) { - WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno)); - ret = ncclSystemError; - goto fail; - } + // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup + NCCLCHECKGOTO(socketResetFd(sock), ret, fail); } else { memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); } - - /* Set socket as non-blocking if async or if we need to be able to abort */ - if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { - int flags; - SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail); - SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail); - } - exit: return ret; fail: @@ -750,12 +797,12 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad goto exit; } -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) { if (sock == NULL) { WARN("ncclSocketProgress: pass NULL socket"); return ncclInvalidArgument; } - NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); + NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed)); return ncclSuccess; } @@ -788,7 +835,7 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { WARN("ncclSocketRecv: pass NULL socket"); return ncclInvalidArgument; } - if (sock->state != ncclSocketStateReady) { + if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) { WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state); return ncclInternalError; } @@ -802,7 +849,8 @@ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock); return ncclInternalError; } - if 
(sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) { + if (sendSock->state != ncclSocketStateReady || + (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) { WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state); return ncclInternalError; } @@ -846,9 +894,20 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +// Make it possible to close just one part of a socket. +ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { if (sock != NULL) { if (sock->fd >= 0) { + shutdown(sock->fd, how); + } + sock->state = ncclSocketStateTerminating; + } + return ncclSuccess; +} + +ncclResult_t ncclSocketClose(struct ncclSocket* sock) { + if (sock != NULL) { + if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc index f1a9756f1..267e12a03 100644 --- a/src/misc/tuner.cc +++ b/src/misc/tuner.cc @@ -16,9 +16,11 @@ pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; -static ncclTuner_v3_t* tunerSymbol = nullptr; +static ncclTuner_v4_t* tunerSymbol = nullptr; +static ncclTuner_v3_t* ncclTuner_v3 = nullptr; static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v3_t ncclTuner_v2_as_v3; +static ncclTuner_v4_t ncclTuner_v2_as_v4; +static ncclTuner_v4_t ncclTuner_v3_as_v4; static int hasNvlsSupport(float** collCostTable) { // Requirements for support of different algorithms: @@ -39,7 +41,20 @@ static int hasCollNetSupport(float** collCostTable) { return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; } -static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) { +static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { + NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); + return ncclSuccess; +} + +static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { + NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); + ncclTuner_v3_as_v4.name = ncclTuner_v3->name; + ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; + ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; + return ncclSuccess; +} + +static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { int algorithm = NCCL_ALGO_UNDEF; int protocol = NCCL_PROTO_UNDEF; int nvlsSupport = hasNvlsSupport(collCostTable); @@ -53,11 +68,11 @@ static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t col return ncclSuccess; } -static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { +static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v3.name = ncclTuner_v2->name; - ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo; - ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy; + ncclTuner_v2_as_v4.name = ncclTuner_v2->name; + ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; + ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; return ncclSuccess; } @@ -198,18 +213,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { goto fail; } - tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); + tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); + ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); + if (ncclTuner_v3 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + dlclose(tunerPluginLib); + goto fail; + } else { + ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; + ncclTuner_v2_as_v4.name = ncclTuner_v2->name; + tunerSymbol = &ncclTuner_v2_as_v4; + } } 
else { - ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init; - ncclTuner_v2_as_v3.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v3; + ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; + ncclTuner_v3_as_v4.name = ncclTuner_v3->name; + tunerSymbol = &ncclTuner_v3_as_v4; } } diff --git a/src/nccl.h.in b/src/nccl.h.in index 431ecb554..8a6f94e24 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -12,6 +12,9 @@ #if CUDART_VERSION >= 11000 #include #endif +#if CUDART_VERSION >= 11080 +#include +#endif #define NCCL_MAJOR ${nccl:Major} #define NCCL_MINOR ${nccl:Minor} @@ -183,6 +186,10 @@ const char* pncclGetErrorString(ncclResult_t result); const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); +/* Reload environment variables that determine logging. */ +void ncclResetDebugInit(); +void pncclResetDebugInit(); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); @@ -236,12 +243,10 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) ncclBfloat16 = 9, - ncclNumTypes = 10 -#else - ncclNumTypes = 9 -#endif + ncclFloat8e4m3 = 10, + ncclFloat8e5m2 = 11, + ncclNumTypes = 12 } ncclDataType_t; /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ diff --git a/src/net.cc b/src/net.cc index 97a8c7381..13e8c2b51 100644 --- a/src/net.cc +++ b/src/net.cc @@ -15,20 +15,95 @@ //#include //#include -static ncclNet_v8_t ncclNet_v5_as_v8; -static ncclNet_v8_t ncclNet_v6_as_v8; -static ncclNet_v8_t ncclNet_v7_as_v8; +static ncclNet_v9_t ncclNet_v5_as_v9; +static ncclNet_v9_t ncclNet_v6_as_v9; +static ncclNet_v9_t ncclNet_v7_as_v9; +static ncclNet_v9_t ncclNet_v8_as_v9; static ncclNet_v5_t *ncclNet_v5; static ncclNet_v6_t *ncclNet_v6; static ncclNet_v7_t *ncclNet_v7; -static ncclCollNet_v8_t ncclCollNet_v5_as_v8; -static ncclCollNet_v8_t ncclCollNet_v6_as_v8; -static ncclCollNet_v8_t ncclCollNet_v7_as_v8; +static ncclNet_v8_t *ncclNet_v8; +static ncclCollNet_v9_t ncclCollNet_v5_as_v9; +static ncclCollNet_v9_t ncclCollNet_v6_as_v9; +static ncclCollNet_v9_t ncclCollNet_v7_as_v9; +static ncclCollNet_v9_t ncclCollNet_v8_as_v9; static ncclCollNet_v5_t *ncclCollNet_v5; static ncclCollNet_v6_t *ncclCollNet_v6; static ncclCollNet_v7_t *ncclCollNet_v7; +static ncclCollNet_v8_t *ncclCollNet_v8; -static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
+#define MAX_COLLNET_SIZE (512*1024*1024L) // Set for initial collnet plugins when size was not dynamically queried + +static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { + ncclNetProperties_v8_t p8; + ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); + if (ans != ncclSuccess) return ans; + props->name = p8.name; + props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = p8.netDeviceType; + props->netDeviceVersion = p8.netDeviceVersion; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v8->init(logfn)); + ncclNet_v8_as_v9.name = ncclNet_v8->name; + ncclNet_v8_as_v9.devices = ncclNet_v8->devices; + ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; + ncclNet_v8_as_v9.listen = ncclNet_v8->listen; + ncclNet_v8_as_v9.connect = ncclNet_v8->connect; + ncclNet_v8_as_v9.accept = ncclNet_v8->accept; + ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; + ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; + ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; + ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; + ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; + ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; + ncclNet_v8_as_v9.test = ncclNet_v8->test; + ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; + ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; + ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; + ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; + ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; + ncclNet_v8_as_v9.makeVDevice = NULL; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; @@ -37,6 +112,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; @@ -44,38 +120,63 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p7.latency; props->netDeviceType = p7.netDeviceType;
props->netDeviceVersion = p7.netDeviceVersion; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v8.name = ncclNet_v7->name; - ncclNet_v7_as_v8.devices = ncclNet_v7->devices; - ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v8.listen = ncclNet_v7->listen; - ncclNet_v7_as_v8.connect = ncclNet_v7->connect; - ncclNet_v7_as_v8.accept = ncclNet_v7->accept; - ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr; - ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v8.isend = ncclNet_v7->isend; - ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv; - ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v8.test = ncclNet_v7->test; - ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr; - ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet_v7_as_v9.name = ncclNet_v7->name; + ncclNet_v7_as_v9.devices = ncclNet_v7->devices; + ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; + ncclNet_v7_as_v9.listen = ncclNet_v7->listen; + ncclNet_v7_as_v9.connect = ncclNet_v7->connect; + ncclNet_v7_as_v9.accept = ncclNet_v7->accept; + ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; + ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; + ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; + ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; + ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; + ncclNet_v7_as_v9.test = ncclNet_v7->test; + ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; + ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; + ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; + ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet_v7_as_v9.makeVDevice = NULL; return
ncclSuccess; } -static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -84,6 +185,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -91,46 +193,71 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v6->connect(dev, handle, sendComm); } -static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { +static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v6->accept(listenComm, recvComm); } -static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v8.name = ncclNet_v6->name; - ncclNet_v6_as_v8.devices = ncclNet_v6->devices; - ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties; - ncclNet_v6_as_v8.listen = ncclNet_v6->listen; - ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect; - ncclNet_v6_as_v8.accept = ncclNet_v6_as_v8_accept; - ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr; - ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v8.isend = ncclNet_v6->isend; - ncclNet_v6_as_v8.irecv =
ncclNet_v6->irecv; - ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v8.test = ncclNet_v6->test; - ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v8.getDeviceMr = NULL; - ncclNet_v6_as_v8.irecvConsumed = NULL; + ncclNet_v6_as_v9.name = ncclNet_v6->name; + ncclNet_v6_as_v9.devices = ncclNet_v6->devices; + ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; + ncclNet_v6_as_v9.listen = ncclNet_v6->listen; + ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; + ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; + ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; + ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; + ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; + ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; + ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; + ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; + ncclNet_v6_as_v9.test = ncclNet_v6->test; + ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; + ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; + ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; + ncclNet_v6_as_v9.getDeviceMr = NULL; + ncclNet_v6_as_v9.irecvConsumed = NULL; + ncclNet_v6_as_v9.makeVDevice = NULL; return ncclSuccess; } -static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -139,6 +266,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -146,48 +274,73 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v5->connect(dev, handle, sendComm); } -static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { +static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v5->accept(listenComm, recvComm); } +static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, 
mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v8.name = ncclNet_v5->name; - ncclNet_v5_as_v8.devices = ncclNet_v5->devices; - ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties; - ncclNet_v5_as_v8.listen = ncclNet_v5->listen; - ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect; - ncclNet_v5_as_v8.accept = ncclNet_v5_as_v8_accept; - ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr; - ncclNet_v5_as_v8.regMrDmaBuf = NULL; - ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v8.isend = ncclNet_v5->isend; - ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv; - ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v8.test = ncclNet_v5->test; - ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v8.getDeviceMr = NULL; - ncclNet_v5_as_v8.irecvConsumed = NULL; + ncclNet_v5_as_v9.name = ncclNet_v5->name; + ncclNet_v5_as_v9.devices = ncclNet_v5->devices; + ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; + ncclNet_v5_as_v9.listen = ncclNet_v5->listen; + ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; + ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; + ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; + ncclNet_v5_as_v9.regMrDmaBuf = NULL; + ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; + ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; + ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; + ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; + ncclNet_v5_as_v9.test = ncclNet_v5->test; + ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; + ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; + ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; + ncclNet_v5_as_v9.getDeviceMr = NULL; + ncclNet_v5_as_v9.irecvConsumed = NULL; + ncclNet_v5_as_v9.makeVDevice = NULL; return ncclSuccess; } -static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -196,6 +349,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -203,38 +357,52 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; +
props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties; - ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr; - ncclCollNet_v5_as_v8.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce; - ncclCollNet_v5_as_v8.iallgather = nullptr; - ncclCollNet_v5_as_v8.ireducescatter = nullptr; - ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen; + ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; + ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; + ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; + ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; + ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; + ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; + ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; + ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; + ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; + ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; + ncclCollNet_v5_as_v9.iallgather = nullptr; + ncclCollNet_v5_as_v9.ireducescatter = nullptr; + ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; + ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; + ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; + ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -243,6 +411,7 @@ static ncclResult_t 
ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -250,38 +419,52 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v6 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties; - ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr; - ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce; - ncclCollNet_v6_as_v8.iallgather = nullptr; - ncclCollNet_v6_as_v8.ireducescatter = nullptr; - ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen; + ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; + ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; + ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; + ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; + ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; + ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; + ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; + ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; + ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; + ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; + ncclCollNet_v6_as_v9.iallgather = nullptr; + ncclCollNet_v6_as_v9.ireducescatter = nullptr; + ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; + ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; + 
ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; + ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; @@ -290,6 +473,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; @@ -297,47 +481,150 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p7.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v7 init to copy over the struct contents // post-init since they may not be initialized before hand. 
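Each of the vN_as_v9 shims in this patch applies the same size-narrowing rule: the v9 entry points take size_t counts while the v5-v8 callbacks still take int, so the wrapper rejects anything the legacy parameter cannot represent and only then casts. A minimal sketch of that guard, assuming the NCCL result codes and MAX_NET_SIZE from the surrounding headers and a hypothetical legacy callback oldIallreduce:

    // Sketch only: refuse the call rather than silently truncate the size_t count.
    static ncclResult_t iallreduceShim(void* collComm, void* sendData, void* recvData, size_t count,
                                       ncclDataType_t dataType, ncclRedOp_t redOp,
                                       void* sendMhandle, void* recvMhandle, void** request) {
      if (count > MAX_NET_SIZE) return ncclInternalError;   // would overflow the legacy int parameter
      return oldIallreduce(collComm, sendData, recvData, (int)count, dataType, redOp,
                           sendMhandle, recvMhandle, request);  // oldIallreduce is hypothetical
    }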
-static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties; - ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr; - ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce; - ncclCollNet_v7_as_v8.iallgather = nullptr; - ncclCollNet_v7_as_v8.ireducescatter = nullptr; - ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen; + ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; + ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; + ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; + ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; + ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; + ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; + ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; + ncclCollNet_v7_as_v9.iallgather = nullptr; + ncclCollNet_v7_as_v9.ireducescatter = nullptr; + ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; + ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; + ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; + ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { + ncclNetProperties_v8_t p8; + ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); + if (ans != ncclSuccess) return ans; + props->name = p8.name; + props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t 
bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + ncclNetSGE_v8_t recvPartsInt; + if (nRecvParts > 1) return ncclInternalError; + if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + recvPartsInt.mhandle = recvParts->mhandle; + recvPartsInt.address = recvParts->address; + recvPartsInt.size = (int)recvParts->size; + ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, + bytesPerRank, windowOffset, windowBytes, + sendMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + ncclNetSGE_v8_t sendPartsInt; + if (nSendParts > 1) return ncclInternalError; + if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + sendPartsInt.mhandle = sendParts->mhandle; + sendPartsInt.address = sendParts->address; + sendPartsInt.size = (int)sendParts->size; + ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, + recvData, bytesPerRank, windowOffset, windowBytes, + dataType, redOp, + recvMhandle, request); + return ans; +} + +// We use a wrapper around the v8 init to copy over the struct contents +// post-init since they may not be initialized before hand. +static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v8->init(logfn)); + ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; + ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; + ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; + ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; + ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; + ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; + ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; + ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; + ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; + ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; + ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; + ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; + ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; + ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; + ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; + ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; return ncclSuccess; } static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; +ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; +ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; enum ncclNetState { ncclNetStateInit = 0, ncclNetStateEnabled = 1, ncclNetStateDisabled = 2 }; -enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; #define MAX_STR_LEN 255 @@ -443,72 +730,93 @@ ncclResult_t 
ncclNetPluginLoad(struct ncclComm* comm) { goto fail; } - ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); + ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); + ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); + if (ncclNet_v8 == nullptr) { + // Try v7 plugin + ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); + if (ncclNet_v7 == nullptr) { + // Try v6 plugin + ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + if (ncclNet_v6 == nullptr) { + // Try v5 plugin + ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (ncclNet_v5 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); + goto fail; + } else { + ncclNets[0] = &ncclNet_v5_as_v9; + ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; + // Set the name right away to allow for NCCL_NET=... to work + ncclNet_v5_as_v9.name = ncclNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); + } } else { - ncclNets[0] = &ncclNet_v5_as_v8; - ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init; + ncclNets[0] = &ncclNet_v6_as_v9; + ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v8.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); + ncclNet_v6_as_v9.name = ncclNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); } } else { - ncclNets[0] = &ncclNet_v6_as_v8; - ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init; + ncclNets[0] = &ncclNet_v7_as_v9; + ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v8.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); + ncclNet_v7_as_v9.name = ncclNet_v7->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); } } else { - ncclNets[0] = &ncclNet_v7_as_v8; - ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init; + ncclNets[0] = &ncclNet_v8_as_v9; + ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; // Set the name right away to allow for NCCL_NET=... 
to work - ncclNet_v7_as_v8.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); + ncclNet_v8_as_v9.name = ncclNet_v8->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); } + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); } // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); + ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); + ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); + if (ncclCollNet_v8 == nullptr) { + ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); + if (ncclCollNet_v7 == nullptr) { + ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + if (ncclCollNet_v6 == nullptr) { + ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (ncclCollNet_v5 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); + } else { + ncclCollNets[0] = &ncclCollNet_v5_as_v9; + ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; + ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); + } } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v8; - ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init; - ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v6_as_v9; + ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; + ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); } } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v8; - ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init; - ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v7_as_v9; + ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; + ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); } } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v8; - ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init; - ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v8_as_v9; + ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; + ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); } + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); } ++netPluginRefCount; @@ -539,6 +847,8 @@ ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { ncclCollNets[0] = nullptr; netPluginStatus = netPluginLoadReady; comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; } pthread_mutex_unlock(&netPluginLock); return ncclSuccess; @@ -561,7 +871,7 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclInternalError; } default: - WARN("Unknown device code index"); + WARN("Unknown device code index %d \n", type); return ncclInternalError; } @@ -715,8 +1025,9 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { int ncclNetVersion(struct ncclComm* comm) { return - (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v8) ? 7 : - 8; + (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : + (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : + (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : + (comm->ncclNet == &ncclNet_v8_as_v9) ? 
8 : + 9; } diff --git a/src/proxy.cc b/src/proxy.cc index 5e657c0a4..bd8188a37 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -364,7 +364,11 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->channelId = op->channelId; sub->nsteps = op->nsteps; sub->nbytes = op->nbytes; + sub->chunkSize = op->chunkSize; sub->offset = 0; + sub->loopSize = op->loopSize; + sub->loopOffset = op->loopOffset; + sub->isOneRPN = op->isOneRPN; sub->peer = op->peer; sub->reg = op->reg; sub->sendMhandle = op->sendMhandle; @@ -374,8 +378,9 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->eActivationMask = op->eActivationMask; sub->taskEventHandle = op->taskEventHandle; sub->rank = op->rank; - args->pid = op->pid; - args->profilerContext = op->profilerContext; + sub->pid = op->pid; + sub->profilerContext = op->profilerContext; + sub->ringAlgo = op->ringAlgo; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -404,6 +409,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->pattern = op->pattern; args->protocol = op->protocol; args->coll = op->coll; + args->algorithm = op->algorithm; args->specifics = op->specifics; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; @@ -485,6 +491,7 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon } if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); + if (proxyOp->ringAlgo) proxyOp->ringAlgo->incRefCount(); op->next = -1; op->connection = proxyConn->connection; if (proxyOps->nextOps == -1) { @@ -601,13 +608,15 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool } break; case ncclPatternPatUp: { // Run full algorithm to count the number of steps for each peer. 
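The two PAT cases rewritten below swap plain NCCLCHECK calls for NCCLCHECKGOTO plus a single cleanup label, so the nstepsSend/nstepsRecv scratch arrays are freed even when a SaveProxy call fails partway through. The shape of that error handling, reduced to a sketch (countAndSave stands in for the per-peer loop and is hypothetical; the NCCL macros and helpers are assumed from the surrounding headers):

    static ncclResult_t patCountSteps(struct ncclComm* comm, int nranks) {
      ncclResult_t result = ncclSuccess;
      int* nstepsSend = NULL;
      int* nstepsRecv = NULL;
      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit);
      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit);
      NCCLCHECKGOTO(countAndSave(comm, nstepsSend, nstepsRecv), result, exit);  // hypothetical
    exit:
      free(nstepsSend);   // free(NULL) is a no-op, so cleanup is safe on every path
      free(nstepsRecv);
      NCCLCHECK(result);  // surface the first recorded failure, if any
      return ncclSuccess;
    }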
- int *nstepsSend, *nstepsRecv; - const int rank = comm->rank, nranks = comm->nRanks; - NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); - NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); int last = 0; + int *nstepsSend = NULL, *nstepsRecv = NULL; + const int rank = comm->rank, nranks = comm->nRanks; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); + NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); + while (last == 0) { int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; size_t inpIx, outIx; @@ -619,24 +628,30 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool if (nstepsSend[i]) { int sendPeer = (rank + (1<nsteps = nstepsSend[i]; - NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_up); } if (nstepsRecv[i]) { int recvPeer = (rank - (1<nsteps = nstepsRecv[i]; - NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_up); } } + exit_pat_up: + free(nstepsSend); + free(nstepsRecv); + NCCLCHECK(result); } break; case ncclPatternPatDown: { // Run full algorithm to count the number of steps for each peer. - int *nstepsSend, *nstepsRecv; - const int rank = comm->rank, nranks = comm->nRanks; - NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); - NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); int last = 0; + int *nstepsSend = NULL, *nstepsRecv = NULL; + const int rank = comm->rank, nranks = comm->nRanks; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); + NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); + while (last == 0) { int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; size_t inpIx, outIx; @@ -648,14 +663,18 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool if (nstepsSend[i]) { int sendPeer = (rank - (1<nsteps = nstepsSend[i]; - NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_down); } if (nstepsRecv[i]) { int recvPeer = (rank + (1<nsteps = nstepsRecv[i]; - NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_down); } } + exit_pat_down: + free(nstepsSend); + free(nstepsRecv); + NCCLCHECK(result); } break; case ncclPatternSend: case ncclPatternRecv: { @@ -735,23 +754,17 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); - while (pool->nextOps == -1 && !state->stop) { + if (pool->nextOps == -1 && 
!state->stop) { ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep); pthread_cond_wait(&pool->cond, &pool->mutex); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup); ncclProfilerStopProxyCtrlEvent(eHandle); } - if (state->stop) { // We might have been woken up to stop. - pthread_mutex_unlock(&pool->mutex); - return ncclSuccess; - } } - state->nextOps = pool->nextOps; pool->nextOps = pool->nextOpsEnd = -1; pthread_mutex_unlock(&pool->mutex); - if (state->nextOps == -1) return ncclInternalError; process_nextops: ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); @@ -889,7 +902,7 @@ void* ncclProxyProgress(void *proxyState_) { * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */ int proxyOpAppendCounter = 0; - while (state->stop == 0 || (state->stop == 1 && state->active)) { + do { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { @@ -902,12 +915,11 @@ void* ncclProxyProgress(void *proxyState_) { if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle); if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); ncclProfilerStopProxyCtrlEvent(eHandle); - if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { + if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; TIME_START(3); - if (state->stop == 0) - ret = ncclProxyGetPostedOps(proxyState, &added); + ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); @@ -918,7 +930,7 @@ void* ncclProxyProgress(void *proxyState_) { } } lastIdle = idle; - } + } while (state->stop == 0 || (state->stop == 1 && state->active)); return NULL; } @@ -1090,7 +1102,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { - NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); + NCCLCHECK(ncclShmOpen(poolPath, sizeof(poolPath), sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } @@ -1293,7 +1305,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; - NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); // Init pool pool->nextOps = -1; @@ -1372,7 +1384,7 @@ static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, vo ncclResult_t ret = ncclSuccess; NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit); - NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), 
rmtFd, rank, hash), ret, exit); + NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), -1, rank, hash), ret, exit); exit: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); return ncclSuccess; @@ -1603,7 +1615,7 @@ void* ncclProxyService(void* _args) { if (pollfds[s].fd == -1) continue; // Progress all ops for this ncclProxyLocalPeer - if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1; + if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode && __atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE)) closeConn = 1; ncclProxyAsyncOp* op = peer->asyncOps; while (op != nullptr) { ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ @@ -1692,11 +1704,17 @@ static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd)); if (hdr.type == ncclProxyMsgGetFd) { - // cuMem API support + // cuMem API support for non-UB case, and rmtFd is not used since UDS proxy thread need to export + // fd from handle and send it back to the main thread to import the buffer. We just need to close + // this dummy rmtFd. uint64_t handle = *(uint64_t*)hdr.data; INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle); + close(rmtFd); return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle); } else if (hdr.type == ncclProxyMsgQueryFd) { + // remote main thread registers buffer into this rank, it querys rmtFd of this rank through UDS + // and the rmtFd is returned unchanged back to remote main thread which will use rmtFd to call into + // proxy service thread for buffer registration. INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd); return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd); } @@ -1743,7 +1761,7 @@ void* ncclProxyServiceUDS(void* _args) { } } - ncclIpcSocketClose(&proxyState->ipcSock); + (void)ncclIpcSocketClose(&proxyState->ipcSock); INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag); return NULL; } @@ -1800,15 +1818,10 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { struct ncclProxyState* sharedProxyState = comm->proxyState; if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { - if (comm->proxyState->threadUDS) { - // UDS support - __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE); - } - if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { struct ncclSocket sock; int type = ncclProxyMsgStop; - ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); + NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); if (ncclSocketConnect(&sock) == ncclSuccess) { (void)ncclSocketSend(&sock, &type, sizeof(int)); } @@ -1835,6 +1848,8 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } } } + // Now we notify proxy service and UDS thread to exit. 
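The release store on the next line pairs with the acquire load that ncclProxyService now performs before closing connections on abort (the __atomic_load_n check added earlier in this file). A reduced sketch of that pairing, using the same GCC builtins; the struct and field names here are illustrative only:

    struct svcState { int stop; /* plus whatever the service thread must observe */ };

    // Shutdown side: finish all bookkeeping first, then publish the flag with RELEASE
    // so those earlier writes become visible to any thread that sees stop == 1.
    static void requestStop(struct svcState* s) {
      __atomic_store_n(&s->stop, 1, __ATOMIC_RELEASE);
    }

    // Service side: an ACQUIRE load; once it returns nonzero, everything written
    // before the matching store is guaranteed to be visible here.
    static int shouldStop(struct svcState* s) {
      return __atomic_load_n(&s->stop, __ATOMIC_ACQUIRE);
    }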
+ __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE); } } diff --git a/src/ras/client.cc b/src/ras/client.cc new file mode 100644 index 000000000..8061cef4e --- /dev/null +++ b/src/ras/client.cc @@ -0,0 +1,318 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nccl.h" +#define NCCL_RAS_CLIENT // Only pull client-specific definitions from the header file below. +#include "ras_internal.h" + +#define STR2(v) #v +#define STR(v) STR2(v) + +// Local timeout increment compared to the '-t' argument, in seconds. +#define TIMEOUT_INCREMENT 1 + +static const char* hostName = "localhost"; +static const char* port = STR(NCCL_RAS_CLIENT_PORT); +static int timeout = -1; +static bool verbose = false; +static int sock = -1; + +static void printUsage(const char* argv0) { + fprintf(stderr, + "Usage: %s [OPTION]...\n" + "Query the state of a running NCCL job.\n" + "\nOptions:\n" + " -h, --host=HOST Host name or IP address of the RAS client socket of the\n" + " NCCL job to connect to (localhost by default)\n" + " -p, --port=PORT TCP port of the RAS client socket of the NCCL job\n" + " (" STR(NCCL_RAS_CLIENT_PORT) " by default)\n" + " -t, --timeout=SECS Maximum time for the local NCCL process to wait for\n" + " responses from other NCCL processes\n" + " (" STR(RAS_COLLECTIVE_LEG_TIMEOUT_SEC) " secs by default; 0 disables the timeout)\n" + " -v, --verbose Increase the verbosity level of the RAS output\n" + " --help Print this help and exit\n" + " --version Print the version number and exit\n", argv0); +} + +static void parseArgs(int argc, char** argv) { + int c; + int optIdx = 0; + struct option longOpts[] = { + {"host", required_argument, NULL, 'h'}, + {"port", required_argument, NULL, 'p'}, + {"timeout", required_argument, NULL, 't'}, + {"verbose", no_argument, NULL, 'v'}, + {"help", no_argument, NULL, 'e'}, + {"version", no_argument, NULL, 'r'}, + {0} + }; + + while ((c = getopt_long(argc, argv, "h:p:t:v", longOpts, &optIdx)) != -1) { + switch (c) { + case 'h': + hostName = optarg; + break; + case 'p': + port = optarg; + break; + case 't': { + char* endPtr = nullptr; + timeout = strtol(optarg, &endPtr, 10); + if (timeout < 0 || !endPtr || *endPtr != '\0') { + fprintf(stderr, "Invalid timeout: %s\n", optarg); + exit(1); + } + break; + } + case 'v': + verbose = true; + break; + case 'e': + printUsage(argv[0]); + exit(0); + case 'r': + fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." + STR(NCCL_PATCH) NCCL_SUFFIX "\n"); + exit(0); + default: + printUsage(argv[0]); + exit(1); + } + } +} + +static ssize_t socketWrite(int fd, const void* buf, size_t count) { + size_t done = 0; + do { + ssize_t ret; + ret = write(fd, ((const char*)buf)+done, count-done); + if (ret == -1) { + if (errno != EINTR) + return -1; + continue; + } + done += ret; + } while (done < count); + + return done; +} + +// Reads a message from RAS. Assumes that the message ends with '\n' (will continue reading until the terminating +// newline, unless false is passed as untilNewLine). +// Terminates the buffer with '\0'. Returns the number of bytes read (excluding the added terminating '\0'). 
+static ssize_t rasRead(int fd, void* buf, size_t count, bool untilNewline = true) { + char* bufChar = (char*)buf; + size_t done = 0; + do { + ssize_t ret; + ret = read(fd, bufChar+done, count-1-done); + if (ret == -1) { + if (errno != EINTR) + return -1; + continue; + } + if (ret == 0) + break; // EOF + done += ret; + } while (untilNewline && (done == 0 || bufChar[done-1] != '\n')); + bufChar[done] = '\0'; + + return done; +} + +static int connectToNCCL() { + struct addrinfo hints = {0}; + struct addrinfo* addrInfo = nullptr; + int ret; + char msgBuf[1024]; + int bytes; + struct timeval tv = {TIMEOUT_INCREMENT, 0}; + +retry: + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if ((ret = getaddrinfo(hostName, port, &hints, &addrInfo)) != 0) { + fprintf(stderr, "Resolving %s:%s: %s\n", hostName, port, gai_strerror(ret)); + goto fail; + } + for (struct addrinfo* ai = addrInfo; ai; ai = ai->ai_next) { + char hostBuf[NI_MAXHOST], portBuf[NI_MAXSERV]; + int err; + sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (sock == -1) { + perror("socket"); + continue; + } + // Initially start with a small, 1-sec timeout to quickly eliminate non-responsive processes... + if (timeout && (setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv) != 0 || + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0)) { + perror("setsockopt"); + // Non-fatal; fall through. + } + if (connect(sock, ai->ai_addr, ai->ai_addrlen) == 0) + break; + err = errno; + if (getnameinfo(ai->ai_addr, ai->ai_addrlen, hostBuf, sizeof(hostBuf), portBuf, sizeof(portBuf), + NI_NUMERICHOST | NI_NUMERICSERV) != 0) { + strcpy(hostBuf, hostName); + strcpy(portBuf, port); + } + fprintf(stderr, "Connecting to %s:%s: %s\n", hostBuf, portBuf, strerror(err)); + close(sock); + sock = -1; + } + freeaddrinfo(addrInfo); + addrInfo = nullptr; + + if (sock == -1) { + fprintf(stderr, "Failed to connect to the NCCL RAS service!\n" + "Please make sure that the NCCL job has the RAS service enabled and that\n" + "%s.\n", + (strcmp(hostName, "localhost") || strcmp(port, STR(NCCL_RAS_CLIENT_PORT)) ? + "the host/port arguments are correct and match NCCL_RAS_ADDR" : + "the RAS client was started on a node where the NCCL job is running")); + goto fail; + } + + // Exchange the RAS client handshake. 
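The handshake exchanged below, together with rasClientEventLoop on the NCCL side later in this patch, is a plain newline-terminated text protocol. A typical session looks roughly like the following ("1" is only illustrative; the real number is whatever NCCL_RAS_CLIENT_PROTOCOL expands to):

    client:  CLIENT PROTOCOL 1
    server:  SERVER PROTOCOL 1
    client:  TIMEOUT 30              (optional; sent only when -t/--timeout was given)
    server:  OK
    client:  STATUS                  (or VERBOSE STATUS)
    server:  <the status report>     (streamed until the server closes the connection; EOF ends the report)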
+ strcpy(msgBuf, "CLIENT PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n"); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("write to socket"); + goto fail; + } + bytes = rasRead(sock, msgBuf, sizeof(msgBuf)); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("read socket"); + goto fail; + } + if (bytes == 0) { + fprintf(stderr, "NCCL unexpectedly closed the connection\n"); + goto fail; + } + if (strncasecmp(msgBuf, "SERVER PROTOCOL ", strlen("SERVER PROTOCOL "))) { + fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf); + goto fail; + } + if (strtol(msgBuf+strlen("SERVER PROTOCOL "), nullptr, 10) != NCCL_RAS_CLIENT_PROTOCOL) { + fprintf(stderr, "NCCL RAS protocol version mismatch (NCCL: %s; RAS client: %d)!\n" + "Will try to continue in spite of that...\n", msgBuf+strlen("SERVER PROTOCOL "), NCCL_RAS_CLIENT_PROTOCOL); + } + + if (timeout >= 0) { + snprintf(msgBuf, sizeof(msgBuf), "TIMEOUT %d\n", timeout); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("write to socket"); + goto fail; + } + bytes = rasRead(sock, msgBuf, sizeof(msgBuf)); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("read socket"); + goto fail; + } + if (bytes == 0) { + fprintf(stderr, "NCCL unexpectedly closed the connection\n"); + goto fail; + } + if (strcasecmp(msgBuf, "OK\n")) { + fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf); + goto fail; + } + } + if (timeout) { + // Increase the socket timeout to accommodate NCCL timeout. + tv.tv_sec += (timeout > 0 ? timeout : RAS_COLLECTIVE_LEG_TIMEOUT_SEC) + RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0) { + perror("setsockopt"); + // Non-fatal; fall through. + } + } + + return 0; +fail: + if (addrInfo) + freeaddrinfo(addrInfo); + if (sock != -1) + (void)close(sock); + return 1; +timeout: + fprintf(stderr, "Connection timed out; retrying...\n"); + (void)close(sock); + goto retry; +} + +int getNCCLStatus() { + char msgBuf[4096]; + int bytes; + snprintf(msgBuf, sizeof(msgBuf), "%sSTATUS\n", (verbose ? 
"VERBOSE " : "")); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + fprintf(stderr, "Connection timed out\n"); + else + perror("write to socket"); + return 1; + } + for (;;) { + bytes = rasRead(sock, msgBuf, sizeof(msgBuf), /*untileNewLine*/false); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + fprintf(stderr, "Connection timed out\n"); + else + perror("read socket"); + return 1; + } + if (bytes == 0) // EOF + break; + if (fwrite(msgBuf, 1, bytes, stdout) != bytes) { + fprintf(stderr, "fwrite to stdout failed!\n"); + return 1; + } + if (fflush(stdout) != 0) { + perror("fflush stdout"); + return 1; + } + } + return 0; +} + +int main(int argc, char** argv) { + parseArgs(argc, argv); + + if (connectToNCCL()) + return 1; + + if (getNCCLStatus()) { + (void)close(sock); + return 1; + } + + if (close(sock) == -1) { + perror("close socket"); + return 1; + } + return 0; +} diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc new file mode 100644 index 000000000..414a1ed94 --- /dev/null +++ b/src/ras/client_support.cc @@ -0,0 +1,1755 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out duriyng development only! +#include +#include +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "utils.h" +#include "ras_internal.h" + +// Outlier count above which we don't print individual details about each of them. +#define RAS_CLIENT_DETAIL_THRESHOLD 10 +// Fraction of the count of the total above which we don't consider another set to be an outlier. +#define RAS_CLIENT_OUTLIER_FRACTION 0.25 +// Fraction of the count of the total below which a set is considered to be an outlier. +#define RAS_CLIENT_VERBOSE_OUTLIER_FRACTION 0.5 + +#define STR2(v) #v +#define STR(v) STR2(v) + +// The RAS client listening socket of this RAS thread (normally port 28028). +int rasClientListeningSocket = -1; + +// Auxiliary structure used when processing the results. Helps with statistics gathering and sorting. +struct rasValCount { + uint64_t value; // The observed value. + int count; // The number of occurences of this value in the results. + int firstIdx; // The index of the first occurence of this value in the results. +}; + +// Used in rasAuxComm below. The values are bitmasks so that they can be combined. +typedef enum { + RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator. + RAS_ACS_INIT = 2, + RAS_ACS_RUNNING = 4, + RAS_ACS_FINALIZE = 8, + RAS_ACS_ABORT = 16 +} rasACStatus; + +// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK). +typedef enum { + RAS_ACE_OK = 0, + RAS_ACE_MISMATCH = 1, + RAS_ACE_ERROR = 2, + RAS_ACE_INCOMPLETE = 4 +} rasACError; + +// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics +// on the number of peers and nodes for a communicator. +struct rasAuxComm { + struct rasCollComms::comm* comm; + int nPeers; + int nNodes; + int ranksPerNodeMin; + int ranksPerNodeMax; + unsigned int status; // Bitmask of rasACStatus values. + unsigned int errors; // Bitmask of rasACError values. 
+ uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against. +}; + +// Connected RAS clients. +struct rasClient* rasClients; +int nRasClients; + +// Minimum byte count to increment the output buffer size by if it's too small. +#define RAS_OUT_INCREMENT 4096 + +// Internal buffer for storing the formatted results. +static char* rasOutBuffer = nullptr; +static int nRasOutBuffer = 0; // Does _not_ include the terminating '\0' (which _is_ present in the buffer). +static int rasOutBufferSize = 0; + +// We use them all over the place; no point in wasting the stack... +static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers + // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // small numbers (times two if the NVML mask is different than the CUDA mask). + // Still, 1024 should normally be plenty (verbose output may make things more difficult, + // but we do check for overflows, so it will just be trimmed). + +static ncclResult_t getNewClientEntry(struct rasClient** pClient); +static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen); +static void rasClientTerminate(struct rasClient* client); + +static ncclResult_t rasClientRun(struct rasClient* client); +static ncclResult_t rasClientRunInit(struct rasClient* client); +static ncclResult_t rasClientRunConns(struct rasClient* client); +static ncclResult_t rasClientRunComms(struct rasClient* client); +static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, + const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync = false); + +static void rasOutAppend(const char* format, ...) __attribute__ ((format(printf, 1, 2))); +static void rasOutExtract(char* buffer); +static int rasOutLength(); +static void rasOutReset(); + +static int rasPeersNGpuCompare(const void* e1, const void* e2); +static int rasPeersNProcsCompare(const void* e1, const void* e2); +static int rasPeersHostPidCompare(const void* e1, const void* e2); +static int ncclSocketsHostCompare(const void* p1, const void* p2); +static int rasValCountsCompareRev(const void* p1, const void* p2); +static int rasAuxCommsCompareRev(const void* p1, const void* p2); +static int rasCommRanksPeerCompare(const void* p1, const void* p2); +static int rasCommRanksCollOpCompare(const void* p1, const void* p2); + +static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size); +static const char* ncclErrorToString(ncclResult_t err); +static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size); +static bool rasCountIsOutlier(int count, bool verbose, int totalCount = -1); + + +/////////////////////////////////// +// General rasClients functions. // +/////////////////////////////////// + +// Creates a listening socket for clients to connect to. 
+ncclResult_t rasClientInitSocket() { + ncclResult_t ret = ncclSuccess; + const char* clientAddr = "localhost:" STR(NCCL_RAS_CLIENT_PORT); + union ncclSocketAddress addr; + const int opt = 1; + if (const char* env = ncclGetEnv("NCCL_RAS_ADDR")) + clientAddr = env; + NCCLCHECKGOTO(ncclSocketGetAddrFromString(&addr, clientAddr), ret, fail); + SYSCHECKGOTO(rasClientListeningSocket = socket(addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, fail); + SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt", ret, fail); +#if defined(SO_REUSEPORT) + SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), + "setsockopt", ret, fail); +#endif + SYSCHECKGOTO(bind(rasClientListeningSocket, &addr.sa, (addr.sa.sa_family == AF_INET ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6))), "bind", ret, fail); + SYSCHECKGOTO(listen(rasClientListeningSocket, 16384), "listen", ret, fail); + INFO(NCCL_INIT|NCCL_RAS, "RAS client listening socket at %s", ncclSocketToString(&addr, rasLine)); +exit: + return ret; +fail: + INFO(NCCL_INIT|NCCL_RAS, "RAS failed to establish a client listening socket at %s", clientAddr); + if (rasClientListeningSocket != -1) { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + } + goto exit; +} + +// Accepts a new RAS client connection. The acceptance process may need to continue in the main event loop. +ncclResult_t rasClientAcceptNewSocket() { + ncclResult_t ret = ncclSuccess; + struct rasClient* client = nullptr; + union ncclSocketAddress addr; + socklen_t addrlen = sizeof(addr); + int flags; + + NCCLCHECKGOTO(getNewClientEntry(&client), ret, fail); + + SYSCHECKGOTO(client->sock = accept(rasClientListeningSocket, (struct sockaddr*)&addr, &addrlen), "accept", ret, fail); + + SYSCHECKGOTO(flags = fcntl(client->sock, F_GETFL), "fcntl", ret, fail); + SYSCHECKGOTO(fcntl(client->sock, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail); + + NCCLCHECKGOTO(rasGetNewPollEntry(&client->pfd), ret, fail); + rasPfds[client->pfd].fd = client->sock; + rasPfds[client->pfd].events = POLLIN; + client->status = RAS_CLIENT_CONNECTED; +exit: + return ret; +fail: + if (client && client->sock != -1) + (void)close(client->sock); + goto exit; +} + +// Returns the index of the first available entry in the rasClients array, enlarging the array if necessary. +static ncclResult_t getNewClientEntry(struct rasClient** pClient) { + struct rasClient* client; + int i; + for (i = 0; i < nRasClients; i++) + if (rasClients[i].status == RAS_CLIENT_CLOSED) + break; + if (i == nRasClients) { + NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT)); + nRasClients += RAS_INCREMENT; + } + + client = rasClients+i; + memset(client, '\0', sizeof(*client)); + client->sock = client->pfd = -1; + ncclIntruQueueConstruct(&client->sendQ); + client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT; + client->collIdx = -1; + + *pClient = client; + return ncclSuccess; +} + +// Allocates a message of the desired length for sending. +// Behind the scenes uses rasMsgAlloc. +// Must use rasClientFreeMsg to free. +static ncclResult_t rasClientAllocMsg(char** msg, size_t msgLen) { + return rasMsgAlloc((struct rasMsg**)msg, msgLen); +} + +// To be used only with messages allocated with rasClientAllocMsg, i.e., for messages meant for sending. +static void rasClientFreeMsg(char* msg) { + rasMsgFree((struct rasMsg*)msg); +} + +// Enqueues a message for sending to a RAS client. 
The message *must* have been allocated using rasClientAllocMsg. +static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen) { + // Get to the metadata of this message. + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + meta->offset = 0; + meta->length = msgLen; + ncclIntruQueueEnqueue(&client->sendQ, meta); + assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED); + rasPfds[client->pfd].events |= POLLOUT; +} + +// Terminates a connection with a RAS client. +static void rasClientTerminate(struct rasClient* client) { + (void)close(client->sock); + client->sock = -1; + client->status = RAS_CLIENT_CLOSED; + rasPfds[client->pfd].fd = -1; + rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; + client->pfd = -1; + while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { + free(meta); + } +} + + +////////////////////////////////////////////////////////////////////// +// Functions related to the asynchronous operations of RAS clients. // +////////////////////////////////////////////////////////////////////// + +// Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and +// reinvokes rasClientRun. +ncclResult_t rasClientResume(struct rasCollective* coll) { + int collIdx = coll-rasCollectives; + int i; + struct rasClient* client = nullptr; + for (i = 0; i < nRasClients; i++) { + client = rasClients+i; + if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + break; + } + } + if (i == nRasClients) { + INFO(NCCL_RAS, "RAS failed to find a matching client!"); + rasCollFree(coll); + goto exit; + } + + NCCLCHECK(rasClientRun(client)); +exit: + return ncclSuccess; +} + +// Handles a ready client FD from the main event loop. +void rasClientEventLoop(int clientIdx, int pollIdx) { + struct rasClient* client = rasClients+clientIdx; + bool closed = false; + + if (client->status == RAS_CLIENT_CONNECTED) { + char* cmd; + char* cmdEnd; + if (rasPfds[pollIdx].revents & POLLIN) { + if (client->recvOffset < sizeof(client->recvBuffer)) { + ssize_t nRecv; + nRecv = recv(client->sock, client->recvBuffer+client->recvOffset, + sizeof(client->recvBuffer) - client->recvOffset, MSG_DONTWAIT); + if (nRecv == 0) { + closed = true; + } else if (nRecv == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + if (errno == ECONNRESET) + INFO(NCCL_RAS, "RAS socket closed by the client on receive; terminating it"); + else + INFO(NCCL_RAS, "RAS unexpected error from recv; terminating the client socket"); + closed = true; + } + } else { // nRecv > 0 + client->recvOffset += nRecv; + } + } else { // client->recvOffset == sizeof(client->recvBuffer) + rasPfds[client->pfd].events &= ~POLLIN; // No room to receive for now. + } + } // if (rasPfds[pollIdx].revents & POLLIN) + if (closed) { + rasClientTerminate(client); + return; + } + cmd = client->recvBuffer; + while ((cmdEnd = (char*)memchr(cmd, '\n', client->recvOffset - (cmd-client->recvBuffer))) != nullptr) { + char* msg; + int msgLen; + *cmdEnd = '\0'; // Replaces '\n'. + if (cmdEnd > cmd && cmdEnd[-1] == '\r') + cmdEnd[-1] = '\0'; // Replaces '\r' (e.g., in case of a telnet connection). + + if (strncasecmp(cmd, "client protocol ", strlen("client protocol ")) == 0) { + // We ignore the protocol version for now; we just send our version back. 
+ snprintf(rasLine, sizeof(rasLine), "SERVER PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n"); + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) { + rasClientTerminate(client); + return; + } + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } else if (strncasecmp(cmd, "timeout ", strlen("timeout ")) == 0) { + char* endPtr = nullptr; + int timeout = strtol(cmd+strlen("timeout "), &endPtr, 10); + if (timeout < 0 || !endPtr || *endPtr != '\0') { + snprintf(rasLine, sizeof(rasLine), "ERROR: Invalid timeout value %s\n", cmd+strlen("timeout ")); + } else { + client->timeout = timeout * CLOCK_UNITS_PER_SEC; + strcpy(rasLine, "OK\n"); + } + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) { + rasClientTerminate(client); + return; + } + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } else if (strcasecmp(cmd, "status") == 0) { + client->status = RAS_CLIENT_INIT; + (void)rasClientRun(client); + } else if (strcasecmp(cmd, "verbose status") == 0) { + client->status = RAS_CLIENT_INIT; + client->verbose = 1; + (void)rasClientRun(client); + } else { + snprintf(rasLine, sizeof(rasLine), "ERROR: Unknown command %s\n", cmd); + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) + return; // It should be non-fatal if we don't return a response... + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } + + cmd = cmdEnd+1; + } // while newline found + + if (cmd == client->recvBuffer) { + if (client->recvOffset == sizeof(client->recvBuffer)) { + // We didn't find any newlines and the buffer is full. + INFO(NCCL_RAS, "RAS excessively long input line; terminating the client socket"); + rasClientTerminate(client); + return; + } + // Otherwise it's an incomplete command; we need to wait for the rest of it. + } else { // cmd > client->recvBuffer + // Shift whatever remains (if anything) to the beginning of the buffer. + memmove(client->recvBuffer, cmd, client->recvOffset - (cmd-client->recvBuffer)); + client->recvOffset -= cmd-client->recvBuffer; + } + } // if (client->status == RAS_CLIENT_CONNECTED) + + if (rasPfds[pollIdx].revents & POLLOUT) { + struct rasMsgMeta* meta; + while ((meta = ncclIntruQueueHead(&client->sendQ)) != nullptr) { + ssize_t nSend; + nSend = send(client->sock, ((char*)&meta->msg)+meta->offset, meta->length-meta->offset, + MSG_DONTWAIT | MSG_NOSIGNAL); + if (nSend < 1) { + if (nSend == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + if (errno == EPIPE) + INFO(NCCL_RAS, "RAS socket closed by the client on send; terminating it"); + else + INFO(NCCL_RAS, "RAS unexpected error from send; terminating the client socket"); + closed = true; + } + break; + } + + meta->offset += nSend; + if (meta->offset < meta->length) + break; + + ncclIntruQueueDequeue(&client->sendQ); + free(meta); + } // while (meta) + + if (closed) { + rasClientTerminate(client); + return; + } + + if (!meta) { + rasPfds[client->pfd].events &= ~POLLOUT; // Nothing more to send for now. 
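The send path just above relies on the per-message offset kept in rasMsgMeta: a short send() simply resumes where it left off on the next POLLOUT, and POLLOUT interest is dropped once the queue drains. Reduced to a sketch (the queue helper is hypothetical; msg carries data, length and offset fields):

    // One drain pass over a send queue on a nonblocking socket.
    while (msg != NULL) {
      ssize_t n = send(fd, msg->data + msg->offset, msg->length - msg->offset,
                       MSG_DONTWAIT | MSG_NOSIGNAL);
      if (n <= 0) {
        // EINTR/EAGAIN/EWOULDBLOCK: retry on the next POLLOUT; anything else: drop the client.
        break;
      }
      msg->offset += n;
      if (msg->offset < msg->length) break;      // partial send; resume later
      msg = dequeueFreeAndPeekNext();            // hypothetical queue helper
    }
    if (msg == NULL) pollEvents &= ~POLLOUT;     // nothing left to send for now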
+ if (client->status == RAS_CLIENT_FINISHED) + rasClientTerminate(client); + } + } // if (rasPfds[pollIdx].revents & POLLOUT) +} + + +////////////////////////////////////////////////////////// +// Functions driving data gathering for the RAS client. // +////////////////////////////////////////////////////////// + +// Main function that drives the whole data gathering process and sends it back to the client. +// There are multiple asynchronous aspects of it (getting the data on connections and on communicators), so the +// function may exit early and needs to be reinvoked when the asynchronous responses arrive or the timeout expires. +// The state tracking the progress of such operations is kept in the rasClient. +static ncclResult_t rasClientRun(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + + switch (client->status) { + case RAS_CLIENT_INIT: + NCCLCHECKGOTO(rasClientRunInit(client), ret, exit); +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. + client->status = RAS_CLIENT_CONNS; + if (ret == ncclInProgress) { + ret = ncclSuccess; + break; + } + case RAS_CLIENT_CONNS: + assert(client->collIdx != -1); + NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); +#endif + client->status = RAS_CLIENT_COMMS; + if (ret == ncclInProgress) { + ret = ncclSuccess; + break; + } + case RAS_CLIENT_COMMS: + assert(client->collIdx != -1); + NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); + client->status = RAS_CLIENT_FINISHED; + break; + default: + WARN("Invalid client status %d", client->status); + ret = ncclInternalError; + goto exit; + } +exit: + return ret; +} + +// Sends to the client the initial data that can be obtained locally -- version info, stats on rasPeers, +// dump of rasDeadPeers. Initiates the RAS_COLL_CONNS collective operation. +static ncclResult_t rasClientRunInit(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasPeerInfo* peersReSorted = nullptr; + int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; + bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; + int firstIdx, nPeers; + struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS]; + int nValCounts; + static int cudaDriver = -1, cudaRuntime = -1; + + rasOutReset(); + rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX + " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); + if (cudaRuntime == -1) + cudaRuntimeGetVersion(&cudaRuntime); + if (cudaDriver == -1) + cudaDriverGetVersion(&cudaDriver); + rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + + rasOutReset(); + totalGpus = totalNodes = 0; + firstNGpusNode = 0; // #GPUs on the first peer of a node. + firstNGpusGlobal = 0; // #GPUs on peerIdx 0. + consistentNGpusNode = true; // Whether #GPUs/peer is consistent between the peers *on any one node*. + consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. + consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. + nPeers = 0; // #peers on a node. 
+ firstNPeersGlobal = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); + totalGpus += nGpus; + if (peerIdx == 0) { + totalNodes = 1; + nPeers = 1; + firstNGpusGlobal = firstNGpusNode = nGpus; + } else { // peerIdx > 0 + if (nGpus != firstNGpusGlobal) + consistentNGpusGlobal = false; + if (!ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasPeers[peerIdx-1].addr)) { + totalNodes++; + if (firstNPeersGlobal == 0) + firstNPeersGlobal = nPeers; + else if (nPeers != firstNPeersGlobal) + consistentNPeersGlobal = false; + nPeers = 1; + firstNGpusNode = nGpus; + } else { // Same node. + if (nGpus != firstNGpusNode) + consistentNGpusNode = false; + nPeers++; + } // Same node + } // peerIdx > 0 + if (peerIdx == nRasPeers-1) { + if (firstNPeersGlobal == 0) + firstNPeersGlobal = nPeers; + else if (nPeers != firstNPeersGlobal) + consistentNPeersGlobal = false; + } + } // for (peerIdx) + + rasOutAppend("Job summary\n" + "===========\n\n"); + + if (consistentNGpusNode && consistentNGpusGlobal && consistentNPeersGlobal) { + rasOutAppend(" Nodes Processes GPUs Processes GPUs\n" + "(total) per node per process (total) (total)\n" + "%7d" " %9d" " %11d" " %9d" " %7d\n", + totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); + } else { + // Gather the stats on the number of processes per node. However, that number is not a property of a peer, + // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively + // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + + firstIdx = 0; + nPeers = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + if (peerIdx == 0) { + nPeers = 1; + firstIdx = 0; + } else { // peerIdx > 0 + if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + for (int i = firstIdx; i < peerIdx; i++) { + // Go back and update the number of processes of all the elements of that node. + peersReSorted[i].cudaDevs = nPeers; + } + nPeers = 1; + firstIdx = peerIdx; + } else { + nPeers++; + } + } // peerIdx > 0 + if (peerIdx == nRasPeers-1) { + // Last iteration of the loop. + for (int i = firstIdx; i < nRasPeers; i++) { + peersReSorted[i].cudaDevs = nPeers; + } + } + } // for (peerIdx) + + // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // secondary, and process id as the tertiary. + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + + // Calculate the distribution of different numbers of peers per node. + nValCounts = 0; + for (int peerIdx = 0; peerIdx < nRasPeers;) { + if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { + valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = peerIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + // Advance peerIdx to the next node. + peerIdx += peersReSorted[peerIdx].cudaDevs; + } + // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent + // number of peers first). + qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); + + // Print it out, the most frequent peer counts first. 
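(The (value, count, firstIdx) bucketing used for valCounts above reappears several times in this file: processes per node, GPUs per process, communicator groups, and collective-op counts. A generic sketch of the pattern over an already-sorted array is shown below; the real rasValCount definition is not part of this excerpt, so only the three fields referenced here are assumed.)

    // Run-length pass over a sorted array, as done for valCounts above; the struct is a
    // stand-in with just the fields this file uses.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ValCount { int64_t value; int count; int firstIdx; };

    static std::vector<ValCount> countRuns(const std::vector<int64_t>& sorted) {
      std::vector<ValCount> out;
      for (int i = 0; i < (int)sorted.size(); i++) {
        if (i == 0 || sorted[i] != sorted[i-1])
          out.push_back({sorted[i], 1, i});   // a new value opens a new bucket
        else
          out.back().count++;                 // the same value extends the current bucket
      }
      return out;
    }

    int main() {
      // E.g., processes-per-node values taken from a peer list sorted by that key.
      std::vector<int64_t> perNode = {7, 8, 8, 8};
      for (const ValCount& vc : countRuns(perNode))
        printf("%d node(s) with %lld process(es), first at index %d\n",
               vc.count, (long long)vc.value, vc.firstIdx);
      return 0;
    }

The real code then re-sorts the resulting buckets with rasValCountsCompareRev so that the most frequent value is reported first and the rarer ones can be flagged as outliers.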
+ if (consistentNGpusNode && consistentNGpusGlobal) { + rasOutAppend(" Nodes Processes GPUs\n" + " per node per process\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend("%7d %9ld %11d\n", + vc->count, vc->value, firstNGpusGlobal); + } + } else { + rasOutAppend(" Nodes Processes\n" + " per node\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend("%7d %9ld\n", + vc->count, vc->value); + } + + // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and + // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). + + // Sort peers by the GPU count, to simplify data extraction. + memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + + // Calculate the distribution of different numbers of GPUs per peer. + nValCounts = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != + __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { + valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = peerIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + } + // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent + // GPU counts first). + qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); + + // Print it out, the most frequent GPU counts first. + rasOutAppend("\n" + " Processes GPUs\n" + " per process\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend(" %9d %11ld\n", + vc->count, vc->value); + } + } + rasOutAppend("\n" + " Nodes Processes GPUs\n" + "(total) (total) (total)\n" + "%7d" " %9d" " %11d\n", + totalNodes, nRasPeers, totalGpus); + + if (consistentNGpusNode && consistentNGpusGlobal) { + // In this simpler case, also print the node outliers. + for (int i = 1; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + // We assume that the most frequent group is correct; for the remaining ones, we try to provide more info, + // provided that they meet our definition of an outlier. + if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { + rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); + // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // the tertiary, which comes in handy when printing... + for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { + lineBuf[0] = '\0'; + for (int j = 0; j < vc->value; j++) { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (j > 0 ? "," : ""), peersReSorted[j].pid); + } + rasOutAppend(" Node %s running process%s %s\n", + ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + (vc->value > 1 ? 
"es" : ""), lineBuf); + } // for (peerIdx) + } // if (rasCountIsOutlier(vc->count)) + } // for (i) + } // !consistentNPeersGlobal + } // !consistentNGpusNode || !consistentNGpusGlobal || !consistentNPeersGlobal + +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. + rasOutAppend("\nGathering data about the RAS network (timeout %lds)...", client->timeout / CLOCK_UNITS_PER_SEC); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_CONNS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; // We need to wait for async. responses. + } +#endif + rasOutAppend("\nCommunicators..."); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_COMMS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; + } +exit: + free(peersReSorted); + return ret; +fail: + goto exit; +} + +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. +// Processes the response from the RAS_COLL_CONNS collective operation and sends the data to the client (for now +// primarily the list of missing processes). Initiates the RAS_COLL_COMMS collective operation. +static ncclResult_t rasClientRunConns(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollConns* connsData = (struct rasCollConns*)coll->data; + int expected; + struct rasPeerInfo* peersBuf = nullptr; + + assert(coll->nFwdSent == coll->nFwdRecv); + client->collIdx = -1; + + rasOutReset(); + rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9); + if (coll->nLegTimeouts > 0) { + rasOutAppend(" Warning: encountered %d communication timeout%s while gathering data\n", coll->nLegTimeouts, + (coll->nLegTimeouts > 1 ? "s" : "")); + } + + expected = nRasPeers - nRasDeadPeers; + if (coll->nPeers != expected) { + int missing = expected - coll->nPeers; + rasOutAppend(" Warning: missing data from %d process%s (received from %d, expected %d)\n", + missing, (missing > 1 ? "es" : ""), coll->nPeers, expected); + if (missing <= RAS_CLIENT_DETAIL_THRESHOLD) { + // Extract a list of missing peers. We don't want to print it right away because it would be sorted + // by address (including port, which isn't meaningful to end users). + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&peersBuf, missing), ret, fail); + // Ensure both arrays are sorted (rasPeers already is, by addr); makes finding missing records a breeze. 
+ qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); + for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { + int cmp; + if (rasPeerIdx < nRasPeers && collPeerIdx < coll->nPeers) + cmp = ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx); + else + cmp = (rasPeerIdx < nRasPeers ? -1 : 1); + + if (cmp == 0) { + rasPeerIdx++; + collPeerIdx++; + } else if (cmp < 0) { + memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + rasPeerIdx++; + } else { // cmp > 0 + // Process not found in rasPeers -- shouldn't happen. + collPeerIdx++; + } // cmp > 0 + } // for (rasPeerIdx, collPeerIdx) + + // Sort the output by host and pid, not host and port. + qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + rasOutAppend(" The missing process%s:\n", (missing > 1 ? "es" : "")); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, + ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersBuf != missing) + rasOutAppend(" [could not find information on %d process%s]\n", + missing-nPeersBuf, (missing-nPeersBuf > 1 ? "es" : "")); + } // if (expected - coll->nPeers <= RAS_CLIENT_DETAIL_THRESHOLD) + } // if (coll->nPeers != expected) + + if (connsData->nConns > 0) { + rasOutAppend(" Collected data about %d unidirectional connection%s\n", + connsData->nConns, (connsData->nConns > 1 ? "s" : "")); + rasOutAppend(" Travel times (valid only if system clocks are synchronized between nodes):\n" + " Minimum %fs, maximum %fs, average %fs\n", + connsData->travelTimeMin/1e9, connsData->travelTimeMax/1e9, + connsData->travelTimeSum/(1e9*connsData->travelTimeCount)); + } else { + rasOutAppend(" No connection data collected!\n"); + } + if (connsData->nNegativeMins > 0) { + rasOutAppend(" Warning: negative travel times were observed across %d connection%s,\n" + " indicating that the system clocks are *not* synchronized.\n" + " Ordering of events based on local timestamps should be considered unreliable\n", + connsData->nNegativeMins, (connsData->nNegativeMins > 1 ? "s" : "")); + if (connsData->nNegativeMins <= RAS_CLIENT_DETAIL_THRESHOLD) { + rasOutAppend(" The affected connection%s:\n", (connsData->nNegativeMins > 1 ? 
"s" : "")); + for (int i = 0; i < connsData->nNegativeMins; i++) { + struct rasCollConns::negativeMin* negativeMin = connsData->negativeMins+i; + int sourcePeerIdx = rasPeerFind(&negativeMin->source); + int destPeerIdx = rasPeerFind(&negativeMin->dest); + if (sourcePeerIdx != -1 && destPeerIdx != -1) + rasOutAppend(" From node %s process %d to node %s process %d: observed travel time of %fs\n", + ncclSocketToHost(&negativeMin->source, rasLine, sizeof(rasLine)), rasPeers[sourcePeerIdx].pid, + ncclSocketToHost(&negativeMin->dest, lineBuf, sizeof(lineBuf)), rasPeers[destPeerIdx].pid, + negativeMin->travelTimeMin/1e9); + } + } + } + rasCollFree(coll); + + rasOutAppend("\nGathering data about the NCCL communicators (timeout %lds)...", + client->timeout / CLOCK_UNITS_PER_SEC); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_COMMS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; + } +exit: + free(peersBuf); + return ret; +fail: + goto exit; +} +#endif + +// Processes the response from the RAS_COLL_COMMS collective operation and sends the data to the client: +// statistics on the communicators, missing data from ranks, inconsistent collective operation counts, +// initialization and asynchronous errors, and inconsistent initialization/termination status. +static ncclResult_t rasClientRunComms(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollComms* commsData = (struct rasCollComms*)coll->data; + struct rasCollComms::comm* comm; + struct rasCollComms::comm::rank* ranksReSorted = nullptr; + struct rasValCount* valCounts = nullptr; + int nValCounts; + struct rasValCount* collOpCounts = nullptr; + struct rasAuxComm* auxComms = nullptr; + int maxCommSize; + int* peerIdxConv = nullptr; + int vcIdx; + int nPeersMissing; + uint64_t* peerNvmlDevs = nullptr; + const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" }; + const char*const errorStr[] = { + // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer. + "OK", + "MISMATCH", + "ERROR", + "ERROR,MISMATCH", + "INCOMPLETE", + "INCOMPLETE,MISMATCH", + "INCOMPLETE,ERROR", + "INCOMPLETE,ERROR,MISMATCH" + }; + + assert(coll->nFwdSent == coll->nFwdRecv); + client->collIdx = -1; + + rasOutReset(); + rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); + + // Calculate the number of missing peers early as we rely on it for other things. + nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + + // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being + // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort + // that array while keeping the data intact. + NCCLCHECKGOTO(ncclCalloc(&auxComms, commsData->nComms), ret, fail); + // While initializing the just allocated array, also find out the size of the largest communicator so that we know + // how much memory to allocate for another temporary array. 
+ maxCommSize = 0; + comm = commsData->comms; + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + if (maxCommSize < comm->commNRanks) + maxCommSize = comm->commNRanks; + auxComms[commIdx].comm = comm; + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); + } + NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + + // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. + NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + // Sort coll->peers to match the ordering of rasPeers -- we may need it later... + qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); + + // Fill in the remaining fields of auxComm's. + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + struct rasAuxComm* auxComm = auxComms+commIdx; + int nRanks = 0; + comm = auxComm->comm; + + if (comm->commNRanks > comm->nRanks) { + // There are two possibilities here. Either we are missing the data on some ranks because the processes are + // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we + // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. + // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this + // as an INCOMPLETE error; otherwise as a MISMATCH warning. + if (nPeersMissing > 0 || nRasDeadPeers > 0) + auxComm->errors |= RAS_ACE_INCOMPLETE; + else { + auxComm->errors |= RAS_ACE_MISMATCH; + auxComm->status |= RAS_ACS_UNKNOWN; + } + } + + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted + // by process _and_ node, which makes counting easy. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare); + + // Count the peers and nodes, get the status/error indicators. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + if (rankIdx == 0) { + auxComm->nPeers = auxComm->nNodes = 1; + auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; + auxComm->ranksPerNodeMax = 0; + auxComm->firstCollOpCount = rank->collOpCount; + nRanks = 1; + } else { // rankIdx > 0 + if (rank->peerIdx != rank[-1].peerIdx) { + auxComm->nPeers++; + if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + auxComm->nNodes++; + if (auxComm->ranksPerNodeMin > nRanks) + auxComm->ranksPerNodeMin = nRanks; + if (auxComm->ranksPerNodeMax < nRanks) + auxComm->ranksPerNodeMax = nRanks; + nRanks = 0; + } + } // if (rank->peerIdx != rank[-1].peerIdx) + nRanks++; + } // rankIdx > 0 + if (rankIdx == comm->nRanks-1) { + // Last iteration of the loop. 
+ if (auxComm->ranksPerNodeMin > nRanks) + auxComm->ranksPerNodeMin = nRanks; + if (auxComm->ranksPerNodeMax < nRanks) + auxComm->ranksPerNodeMax = nRanks; + } + + if (rank->status.abortFlag) + auxComm->status |= RAS_ACS_ABORT; + else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by + // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling + // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly + // as a finalize state indicator (and ignore it in case of ncclCommAbort). + auxComm->status |= RAS_ACS_FINALIZE; + } + else if (rank->status.initState == ncclSuccess) + auxComm->status |= RAS_ACS_RUNNING; + else // rank->initState != ncclSuccess + auxComm->status |= RAS_ACS_INIT; + + if (rank->collOpCount != auxComm->firstCollOpCount) + auxComm->errors |= RAS_ACE_MISMATCH; + if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + auxComm->errors |= RAS_ACE_ERROR; + if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + auxComm->errors |= RAS_ACE_ERROR; + } // for (rankIdx) + + if (__builtin_popcount(auxComm->status) > 1) { + // We've got a status mismatch between ranks. + auxComm->errors |= RAS_ACE_MISMATCH; + } + } // for (commIdx) + // Sort it by size/nNodes/status/errors/missing ranks. + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + + // Calculate the distribution of different communicator sizes. + NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); + nValCounts = 0; + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + if (commIdx == 0 || + auxComms[commIdx].comm->commNRanks != auxComms[commIdx-1].comm->commNRanks || + auxComms[commIdx].nNodes != auxComms[commIdx-1].nNodes || + // __builtin_clz returns the number of leading 0-bits, which is a proxy for the index of the highest 1-bit. + __builtin_clz(auxComms[commIdx].status) != __builtin_clz(auxComms[commIdx-1].status) || + auxComms[commIdx].errors != auxComms[commIdx-1].errors) { + valCounts[nValCounts].value = 0; // We have many distinguishing values but only one field to store them. + // It doesn't really matter, given that we can extract them via firstIdx. + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = commIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + } + + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + if (commsData->nComms == 0) + rasOutAppend("No communicator data collected!\n"); + + // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. + NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); + + // Print it out, the largest communicators first. + for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc = valCounts+vcIdx; + struct rasAuxComm* auxComm = auxComms+vc->firstIdx; + int ranksPerNodeMin, ranksPerNodeMax; + int ranksTotal; + + ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; + ranksPerNodeMax = 0; + memset(peerNvmlDevs, '\0', coll->nPeers * sizeof(*peerNvmlDevs)); + // We don't group comms by ranksPerNodeMin/Max, so the values may differ between comms in one group. + // Calculate the group's min/max. + // Also calculate the number of unique ranks in the group. 
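(Each rank contributes one RAS_ACS_* bit to auxComm->status above; more than one bit set (popcount > 1) means the ranks disagree, and the highest bit set, recovered via __builtin_clz, is what later indexes statusStr. A standalone illustration follows; the numeric RAS_ACS_* values are assumptions chosen to be consistent with the statusStr indexing, not values taken from the patch.)

    // Folding per-rank status bits and mapping the result back to a label, mirroring the
    // auxComm->status handling; bit values are assumed (UNKNOWN must land at index 0).
    #include <cstdio>

    enum : unsigned {
      ACS_UNKNOWN  = 1u << 0,
      ACS_INIT     = 1u << 1,
      ACS_RUNNING  = 1u << 2,
      ACS_FINALIZE = 1u << 3,
      ACS_ABORT    = 1u << 4,
    };
    static const char* const kStatusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" };

    int main() {
      unsigned status = 0;
      status |= ACS_RUNNING;                                   // most ranks are running...
      status |= ACS_FINALIZE;                                  // ...but one already finalized
      bool mismatch = __builtin_popcount(status) > 1;          // what triggers RAS_ACE_MISMATCH
      int idx = (int)(sizeof(unsigned)*8 - 1) - __builtin_clz(status);  // index of highest set bit
      printf("status: %s%s\n", kStatusStr[idx], mismatch ? " (ranks disagree)" : "");
      return 0;
    }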
+ for (int commIdx = 0; commIdx < vc->count; commIdx++) { + if (ranksPerNodeMin > auxComm[commIdx].ranksPerNodeMin) + ranksPerNodeMin = auxComm[commIdx].ranksPerNodeMin; + if (ranksPerNodeMax < auxComm[commIdx].ranksPerNodeMax) + ranksPerNodeMax = auxComm[commIdx].ranksPerNodeMax; + for (int rankIdx = 0; rankIdx < auxComm[commIdx].comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = auxComm[commIdx].comm->ranks+rankIdx; + peerNvmlDevs[rank->peerIdx] |= (1UL << rank->nvmlDev); + } + } + ranksTotal = 0; + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + ranksTotal += __builtin_popcountll(peerNvmlDevs[peerIdx]); + if (ranksPerNodeMin == ranksPerNodeMax) + snprintf(rasLine, sizeof(rasLine), "%d", ranksPerNodeMin); + else + snprintf(rasLine, sizeof(rasLine), "%d-%d", ranksPerNodeMin, ranksPerNodeMax); + rasOutAppend("%5d %8d %8d %8s %8d %8d %8s %6s\n", + vcIdx, vc->count, auxComm->nNodes, rasLine, auxComm->comm->commNRanks, ranksTotal, + // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the + // status (which is a bitmask) into an array index. + statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); + } + + rasOutAppend("\nErrors\n" + "======\n\n"); + + if (nPeersMissing > 0) { + rasOutAppend("INCOMPLETE\n" + " Missing communicator data from %d job process%s\n", nPeersMissing, (nPeersMissing > 1 ? "es" : "")); + if (rasCountIsOutlier(nPeersMissing, client->verbose)) { + // Extract a list of missing peers. We don't want to print it right away because it would be sorted + // by address (including port, which isn't meaningful to end users). + struct rasPeerInfo* peersBuf = nullptr; + int nPeersBuf; + + // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing + // them much easier. + NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + nPeersBuf = 0; + for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { + int cmp; + if (rasPeerIdx < nRasPeers && collPeerIdx < coll->nPeers) + cmp = ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx); + else + cmp = (rasPeerIdx < nRasPeers ? -1 : 1); + + if (cmp == 0) { + rasPeerIdx++; + collPeerIdx++; + } else if (cmp < 0) { + // Process missing from coll->peers. Don't report dead ones though, as they are not included + // in nPeersMissing and are reported separately below. + if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { + assert(nPeersBuf < nPeersMissing); + memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + } + rasPeerIdx++; + } else { // cmp > 0 + // Process not found in rasPeers -- shouldn't happen, unless during a race? + collPeerIdx++; + } // cmp > 0 + } // for (rasPeerIdx, collPeerIdx) + + // Sort the output by host and pid. + qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, + ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersBuf != nPeersMissing) + rasOutAppend(" [could not find information on %d process%s]\n", + nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? 
"es" : "")); + free(peersBuf); + } // if (rasCountIsOutlier(nPeersMissing)) + rasOutAppend("\n"); + } + + if (nRasDeadPeers > 0) { + rasOutAppend("DEAD\n" + " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, + (nRasDeadPeers > 1 ? "es are" : " is")); + if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { + struct rasPeerInfo* peersReSorted = nullptr; + int nPeersReSorted = 0; + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + for (int i = 0; i < nRasDeadPeers; i++) { + int peerIdx = rasPeerFind(rasDeadPeers+i); + if (peerIdx != -1) + memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + } + // Sort the output by host and pid, not host and port. + qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); + for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, + ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersReSorted != nRasDeadPeers) + rasOutAppend(" [could not find information on %d process%s]\n", + nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); + free(peersReSorted); + } // if (rasCountIsOutlier(nRasDeadPeers) + rasOutAppend("\n"); + } + + for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc; + vc = valCounts+vcIdx; + for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { + struct rasAuxComm* auxComm = auxComms+commIdx; + comm = auxComm->comm; + + if (auxComm->errors & RAS_ACE_INCOMPLETE) { + int nRanksMissing = comm->commNRanks - comm->nRanks; + rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" + " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, + comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); + if (rasCountIsOutlier(nRanksMissing, client->verbose)) { + lineBuf[0] = '\0'; + // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the + // exception of the missing ranks... + for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { + if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { + rankIdx++; + } else { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (rankIdx == commRank ? "" : ","), commRank); + } + } // for (commRank) + rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + } // if (rasCountIsOutlier(nRanksMissing)) + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_INCOMPLETE) + + if (auxComm->errors & RAS_ACE_ERROR) { + int ncclErrors[ncclNumResults]; + int nErrors; + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + + memset(ncclErrors, '\0', sizeof(ncclErrors)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ncclErrors[comm->ranks[rankIdx].status.initState]++; + nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]); + if (nErrors > 0) { + rasOutAppend(" Initialization error%s on %d rank%s\n", + (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? 
"s" : "")); + rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors); + } + + memset(ncclErrors, '\0', sizeof(ncclErrors)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ncclErrors[comm->ranks[rankIdx].status.asyncError]++; + nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]); + if (nErrors > 0) { + rasOutAppend(" Asynchronous error%s on %d rank%s\n", + (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? "s" : "")); + rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors, /*isAsync*/true); + } + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_ERROR) + } // for (commIdx) + } // for (vcIdx) + + rasOutAppend("Warnings\n" + "========\n\n"); + + if (coll->nLegTimeouts > 0) { + rasOutAppend("TIMEOUT\n" + " Encountered %d communication timeout%s while gathering communicator data\n\n", + coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); + } + + for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc = valCounts+vcIdx; + for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { + bool inconsistent; + struct rasAuxComm* auxComm = auxComms+commIdx; + comm = auxComm->comm; + + if (auxComm->errors & RAS_ACE_MISMATCH) { + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + + if (collOpCounts == nullptr) { + // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts + // array is reverse-sorted by commNRanks. On the other hand, for this purpose allocating commNRanks + // elements may be massively overpessimistic... + NCCLCHECKGOTO(ncclCalloc(&collOpCounts, comm->commNRanks), ret, fail); + } + + if (__builtin_popcount(auxComm->status) > 1) { + rasOutAppend(" Communicator ranks have different status\n"); + + // We need to sort the ranks by status. However, status is normally calculated from other fields. + // We will copy the ranks and reuse collOpCount to store it. + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + + if (rank->status.abortFlag) + rank->collOpCount = RAS_ACS_ABORT; + else if (rank->status.finalizeCalled || rank->status.destroyFlag) + rank->collOpCount = RAS_ACS_FINALIZE; + else if (rank->status.initState == ncclSuccess) + rank->collOpCount = RAS_ACS_RUNNING; + else + rank->collOpCount = RAS_ACS_INIT; + } + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + // Calculate the frequency of different status values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the + // status (which is a bitmask) into an array index. + collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + if (comm->nRanks < comm->commNRanks) { + // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" + // string at index 0. 
+ collOpCounts[nCollOpCounts].value = 0; + collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. + nCollOpCounts++; + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) + rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + if (vcc->firstIdx != -1) { + // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, statusStr[vcc->value], + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (vcc->count > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + else + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + } // peerIdx == -1 + } // for (rankIdx) + } else { + // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). + lineBuf[0] = '\0'; + // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the + // exception of the missing ranks... + for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { + if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { + rankIdx++; + } else { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (rankIdx == commRank ? "" : ","), commRank); + } + } // for (commRank) + if (vcc->count > 1) { + rasOutAppend(" The unknown ranks: %s\n", lineBuf); + } else { + rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); + } + } + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + } // if (__builtin_popcount(auxComm->status) > 1) + + inconsistent = false; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { + inconsistent = true; + break; + } + } + if (inconsistent) { + rasOutAppend(" Communicator ranks have different collective operation counts\n"); + + // Sort the ranks by collOpCount and rank for easy counting. + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + // Calculate the frequency of different collOpCount values. 
+ int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, vcc->value, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (vcc->count > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + else + rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", + ranksReSorted[rankIdx].commRank, vcc->value); + } // peerIdx == -1 + } // for (rankIdx) + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + } // if (inconsistent) + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_MISMATCH) + } // for (commIdx) + } // for (vcIdx) + rasCollFree(coll); + + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; +exit: + free(peerNvmlDevs); + free(collOpCounts); + free(valCounts); + free(peerIdxConv); + free(ranksReSorted); + free(auxComms); + return ret; +fail: + goto exit; +} + +static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, + const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) { + for (;;) { + int maxCount = 0; + ncclResult_t maxCountIdx = ncclSuccess; + for (int i = ncclUnhandledCudaError; i < ncclInProgress; i++) { + if (maxCount < ncclErrors[i]) { + maxCount = ncclErrors[i]; + maxCountIdx = (ncclResult_t)i; + } + } // for (i) + if (maxCountIdx == ncclSuccess) + break; + if (maxCount > 1) + rasOutAppend(" %d ranks reported %s\n", maxCount, ncclErrorToString(maxCountIdx)); + if (rasCountIsOutlier(maxCount, client->verbose)) { + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if ((isAsync ? 
comm->ranks[rankIdx].status.asyncError : comm->ranks[rankIdx].status.initState) == maxCountIdx) { + int peerIdx = peerIdxConv[comm->ranks[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (maxCount > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + comm->ranks[rankIdx].commRank, + rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d reported %s -- GPU %s managed by process %d on node %s\n", + comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx), + rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (maxCount > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", comm->ranks[rankIdx].commRank); + else + rasOutAppend(" Rank %d reported %s -- [process information not found]\n", + comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx)); + } // peerIdx == -1 + } // if rank's error matches + } // for (rankIdx) + } // if (rasCountIsOutlier(maxCount)) + ncclErrors[maxCountIdx] = 0; + } // for (;;) +} + + +////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the internal output buffer. // +////////////////////////////////////////////////////////////////////// + +// Appends a printf-formatted string to the output buffer. +// Unlike with INFO or WARN messages, the caller should terminate lines with '\n' as appropriate. +static void rasOutAppend(const char* format, ...) { + ncclResult_t ret; // Ignored. + va_list vargs; + int needed; + va_start(vargs, format); + needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs); + va_end(vargs); + + if (needed < 0) // Output error (whatever that might be...) + return; + + // The +1 below accounts for the terminating '\0'. + if (needed + 1 > rasOutBufferSize-nRasOutBuffer) { + int newBufferSize = ROUNDUP(nRasOutBuffer+needed+1, RAS_OUT_INCREMENT); + NCCLCHECKGOTO(ncclRealloc(&rasOutBuffer, rasOutBufferSize, newBufferSize), ret, exit); + rasOutBufferSize = newBufferSize; + + va_start(vargs, format); + needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs); + va_end(vargs); + + if (needed < 0) // Output error (whatever that might be...) + return; + } + + nRasOutBuffer += needed; + assert(nRasOutBuffer <= rasOutBufferSize); +exit: + ; +} + +// Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. +// The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes +// the terminating '\0'). +static void rasOutExtract(char* buffer) { + if (rasOutBuffer) + memcpy(buffer, rasOutBuffer, rasOutLength()); +} + +// Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. +static int rasOutLength() { + return nRasOutBuffer; +} + +// Resets the output buffer position to the beginning (effectively clearing the buffer). +static void rasOutReset() { + ncclResult_t ret; // Ignored. 
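(rasOutAppend above uses the classic two-pass vsnprintf idiom: format into the space that is left, and if the return value says the output did not fit, grow the buffer and format again. A stripped-down version of the same idiom is sketched below, with the NCCL allocator and the RAS_OUT_INCREMENT rounding replaced by plain realloc for brevity; it is an illustration, not the patch's implementation.)

    // Grow-and-retry vsnprintf into a heap buffer, the idiom used by rasOutAppend above.
    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    static char* outBuf;
    static int outUsed, outCap;

    static void outAppend(const char* format, ...) {
      if (outBuf == nullptr) {
        outCap = 64;
        outBuf = (char*)calloc(outCap, 1);
        if (outBuf == nullptr) return;
      }
      va_list vargs;
      va_start(vargs, format);
      int needed = vsnprintf(outBuf+outUsed, outCap-outUsed, format, vargs);  // first attempt
      va_end(vargs);
      if (needed < 0) return;
      if (needed + 1 > outCap - outUsed) {                // +1 for the terminating '\0'
        int newCap = outUsed + needed + 1;
        char* newBuf = (char*)realloc(outBuf, newCap);
        if (newBuf == nullptr) return;
        outBuf = newBuf; outCap = newCap;
        va_start(vargs, format);
        needed = vsnprintf(outBuf+outUsed, outCap-outUsed, format, vargs);    // retry with room
        va_end(vargs);
        if (needed < 0) return;
      }
      outUsed += needed;
    }

    int main() {
      for (int i = 0; i < 100; i++) outAppend("line %d\n", i);  // forces several regrowths
      if (outBuf) fwrite(outBuf, 1, outUsed, stdout);
      free(outBuf);
      return 0;
    }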
+ nRasOutBuffer = 0; + if (rasOutBuffer == nullptr) { + NCCLCHECKGOTO(ncclCalloc(&rasOutBuffer, RAS_OUT_INCREMENT), ret, exit); + rasOutBufferSize = RAS_OUT_INCREMENT; + } +exit: + ; +} + + +/////////////////////////////////////////////////////////////////// +// Various sorting callbacks used when grouping/formatting data. // +/////////////////////////////////////////////////////////////////// + +// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the +// secondary key and the process id as the tertiary key. +static int rasPeersNGpuCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + int c1 = __builtin_popcountll(p1->cudaDevs); + int c2 = __builtin_popcountll(p2->cudaDevs); + + if (c1 == c2) { + // Host IP address is the secondary key. + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the tertiary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; + } else { + return (c1 < c2 ? -1 : 1); + } +} + +// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. +// Uses the host IP as the secondary key and the process id as the tertiary key. +static int rasPeersNProcsCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + + if (p1->cudaDevs == p2->cudaDevs) { + // Host IP address is the secondary key. + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the tertiary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; + } else { + return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); + } +} + +// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather +// than the port). +static int rasPeersHostPidCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the secondary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; +} + +// Sorting callback for ncclSocketAddress. Unlike the ncclSocketsCompare, it ignores the port. +static int ncclSocketsHostCompare(const void* p1, const void* p2) { + const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; + const union ncclSocketAddress* a2 = (const union ncclSocketAddress*)p2; + // AF_INET (2) is less than AF_INET6 (10). + int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) { + if (family > 0 && a2->sa.sa_family > 0) + return (family < a2->sa.sa_family ? -1 : 1); + else // Put empty addresses at the end (not that it matters...). + return (family > 0 ? -1 : 1); + } + + int cmp; + if (family == AF_INET) { + cmp = memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)); + } + else if (family == AF_INET6) { + cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); + } else { + // The only remaining valid case are empty addresses. + assert(family == 0); + cmp = 0; // Two empty addresses are equal... + } + + return cmp; +} + +// Sorting callback for rasValCount elements. Sorts by the count, largest first. 
Value is the secondary key.
+static int rasValCountsCompareRev(const void* p1, const void* p2) {
+  const struct rasValCount* r1 = (const struct rasValCount*)p1;
+  const struct rasValCount* r2 = (const struct rasValCount*)p2;
+
+  if (r1->count == r2->count) {
+    return (r1->value > r2->value ? -1 : (r1->value < r2->value ? 1: 0));
+  } else {
+    return (r1->count > r2->count ? -1 : 1);
+  }
+}
+
+// Sorting callback for rasAuxComm elements.
+// Sorts the comms by the rank count (commNRanks), nNodes as secondary key, status as the tertiary, and errors as
+// the quaternary. Sorts in reverse (largest first).
+// The final key is the comm's nRanks, sorted in reverse to the other keys, so comms with the largest number
+// of ranks *missing* will be first.
+static int rasAuxCommsCompareRev(const void* p1, const void* p2) {
+  const struct rasAuxComm* c1 = (const struct rasAuxComm*)p1;
+  const struct rasAuxComm* c2 = (const struct rasAuxComm*)p2;
+
+  if (c1->comm->commNRanks == c2->comm->commNRanks) {
+    if (c1->nNodes == c2->nNodes) {
+      // We don't want to compare the status values directly because they could be bitmasks and we are only
+      // interested in the highest bit set.
+      // __builtin_clz returns the number of leading 0-bits, so in our case the value will be the *smallest*
+      // if RAS_ACS_ABORT (8) is set and the *largest* if only RAS_ACS_INIT (1) is set, so we reverse the
+      // comparison to get the desired sorting order.
+      int s1 = __builtin_clz(c1->status);
+      int s2 = __builtin_clz(c2->status);
+      if (s1 == s2) {
+        if (c1->errors == c2->errors) {
+          if (c1->comm->nRanks == c2->comm->nRanks) {
+            return 0;
+          } else {
+            return (c1->comm->nRanks < c2->comm->nRanks ? -1 : 1);
+          }
+        } else {
+          return (c1->errors > c2->errors ? -1 : 1);
+        }
+      } else {
+        return (s1 < s2 ? -1 : 1);
+      }
+    } else {
+      return (c1->nNodes > c2->nNodes ? -1 : 1);
+    }
+  } else {
+    return (c1->comm->commNRanks > c2->comm->commNRanks ? -1 : 1);
+  }
+}
+
+// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx.
+static int rasCommRanksPeerCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2;
+
+  return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0));
+}
+
+// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key.
+static int rasCommRanksCollOpCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2;
+
+  if (r1->collOpCount == r2->collOpCount) {
+    // Use the rank as the secondary key.
+    return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0));
+  } else {
+    return (r1->collOpCount < r2->collOpCount ? -1 : 1);
+  }
+}
+
+
+////////////////////////////////////////////////////////////
+// String formatting functions for various types of data. //
+////////////////////////////////////////////////////////////
+
+// Converts GPU mask(s) to a string. If the CUDA mask is different from the NVML mask, both are printed.
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) {
+  bool first = true;
+  buf[0] = '\0';
+  for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+    if (cudaDevs & (1UL << i)) {
+      snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+      first = false;
+    }
+  if (cudaDevs != nvmlDevs) {
+    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML ");
+    first = true;
+    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+      if (nvmlDevs & (1UL << i)) {
+        snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+        first = false;
+      }
+    snprintf(buf+strlen(buf), size-strlen(buf), ")");
+  }
+  return buf;
+}
+
+// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are
+// printed.
+static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) {
+  snprintf(buf, size, "%d", rank->cudaDev);
+  if (rank->cudaDev != rank->nvmlDev) {
+    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev);
+  }
+  return buf;
+}
+
+// Converts a NCCL error result to a string.
+static const char* ncclErrorToString(ncclResult_t err) {
+  switch (err) {
+    case ncclUnhandledCudaError : return "Unhandled CUDA error";
+    case ncclSystemError : return "System error";
+    case ncclInternalError : return "Internal error";
+    case ncclInvalidArgument : return "Invalid argument";
+    case ncclInvalidUsage : return "Invalid usage";
+    case ncclRemoteError : return "Remote process error";
+    case ncclInProgress : return "NCCL operation in progress";
+    default : return "Unexpected error";
+  }
+}
+
+// Converts the IP number of a NCCL address to a string (the port part is ignored and no DNS resolution is attempted).
+static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size) {
+  if (addr->sa.sa_family > 0)
+    return inet_ntop(addr->sa.sa_family,
+                     (addr->sa.sa_family == AF_INET ? (void*)&addr->sin.sin_addr : (void*)&addr->sin6.sin6_addr),
+                     buf, size);
+  else {
+    if (size > 0)
+      buf[0] = '\0';
+    return buf;
+  }
+}
+
+// Determines if the given count constitutes an outlier.
+static bool rasCountIsOutlier(int count, bool verbose, int totalCount) {
+  if (count == 1)
+    return true; // A single rank is always considered an outlier...
+  if (verbose) {
+    return (totalCount != -1 ? count < totalCount * RAS_CLIENT_VERBOSE_OUTLIER_FRACTION : true);
+  } else {
+    return count <= RAS_CLIENT_DETAIL_THRESHOLD &&
+           (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION);
+  }
+}
diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc
new file mode 100644
index 000000000..201144f1a
--- /dev/null
+++ b/src/ras/collectives.cc
@@ -0,0 +1,762 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include
+#include
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// The number of recent collectives to keep track of. Completely arbitrary.
+#define COLL_HISTORY_SIZE 64
+
+// An entry in the rasCollHistory array keeping track of recently completed collectives (to make it possible to
+// identify and drop duplicates arriving over different links).
+struct rasCollHistoryEntry { + union ncclSocketAddress rootAddr; + uint64_t rootId; +}; + +// Array keeping track of recently completed collectives (to avoid infinite loops). LRU-based replacement. +static struct rasCollHistoryEntry rasCollHistory[COLL_HISTORY_SIZE]; +static int nRasCollHistory, rasCollHistNextIdx; + +// Monotonically increased to ensure that each collective originating locally has a unique Id. +static uint64_t rasCollLastId; + +// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// no such tracking). +struct rasCollective* rasCollectives; +static int nRasCollectives; + +static ncclResult_t getNewCollEntry(struct rasCollective** pColl); +static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, + const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); +static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); +static ncclResult_t rasCollReadyResp(struct rasCollective* coll); +static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, + const union ncclSocketAddress* rootAddr, uint64_t rootId, + const union ncclSocketAddress* peers, int nPeers, + const char* data, int nData, int nLegTimeouts); + +static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); + +static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static int ncclCommsCompare(const void* p1, const void* p2); + + +/////////////////////////////////////////////////////////////////////////////////////// +// Functions related to the initialization of collectives and the message exchanges. // +/////////////////////////////////////////////////////////////////////////////////////// + +// Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. +static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { + struct rasCollective* coll; + int i; + for (i = 0; i < nRasCollectives; i++) + if (rasCollectives[i].type == RAS_MSG_NONE) + break; + if (i == nRasCollectives) { + NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); + nRasCollectives += RAS_INCREMENT; + } + + coll = rasCollectives+i; + memset(coll, '\0', sizeof(*coll)); + coll->startTime = clockNano(); + coll->fromConnIdx = -1; + // We are unlikely to use the whole array, but at least we won't need to realloc. + NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + + *pColl = coll; + return ncclSuccess; +} + +// Initializes a collective request by giving it a unique ID. +void rasCollReqInit(struct rasCollRequest* req) { + memcpy(&req->rootAddr, &rasNetListeningSocket.addr, sizeof(req->rootAddr)); + req->rootId = ++rasCollLastId; +} + +// Sends a collective request message through all regular RAS network connections (effectively, broadcasts it). +// Also used for re-broadcasts (on peers receiving the request over the network). +// Checking for duplicates is the responsibility of the caller. +// For collectives other than broadcasts, initializes a rasCollective structure and fills it with local data, +// in preparation for collective response messages. +// pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible +// in scenarios such as a total of two peers. 
+// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// it's a broadcast, which requires no such tracking). +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, + int fromConnIdx) { + struct rasCollective* coll = nullptr; + if (req->type >= RAS_COLL_CONNS) { + // Keep track of this collective operation so that we can handle the responses appropriately. + NCCLCHECK(getNewCollEntry(&coll)); + if (pCollIdx) + *pCollIdx = coll-rasCollectives; + memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); + coll->rootId = req->rootId; + coll->type = req->type; + coll->timeout = req->timeout; + coll->fromConnIdx = fromConnIdx; + if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { + memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); + coll->nPeers = 1; + } + + // Collective-specific initialization of accumulated data (using local data for now). + if (req->type == RAS_COLL_CONNS) + (void)rasCollConnsInit(&coll->data, &coll->nData); + else if (req->type == RAS_COLL_COMMS) + (void)rasCollCommsInit(&coll->data, &coll->nData); + } else { // req->type < RAS_COLL_CONNS + // Add the info to the collective message history. + nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE); + memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &req->rootAddr, + sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr)); + rasCollHistory[rasCollHistNextIdx].rootId = req->rootId; + rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE; + + // Collective-specific message handling. + if (req->type == RAS_BC_DEADPEER) { + bool done = false; + rasMsgHandleBCDeadPeer(req, &done); + if (done) + goto exit; + } + } // req->type < RAS_COLL_CONNS + + for (int connIdx = 0; connIdx < nRasConns; connIdx++) + rasConns[connIdx].linkFlag = false; + + (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx); + (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx); + + if (coll && pAllDone) + *pAllDone = (coll->nFwdSent == coll->nFwdRecv); +exit: + return ncclSuccess; +} + +// Sends the collective message through all connections associated with this link (with the exception of the one +// the message came from, if any). +static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, + const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + if (!conn->linkFlag) { + // We send collective messages through fully established and operational connections only. + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr) + coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx; + } // if (conn->sockIdx != -1 && RAS_SOCK_READY) + conn->linkFlag = true; + } // if (!conn->linkFlag) + } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) + } // for (i) + + return ncclSuccess; +} + +// Sends a collective message down a particular connection.
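+// The resulting message is simply the RAS_MSG_COLLREQ header followed by a verbatim copy of the request, i.e., +// msgLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen (see below).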
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen; + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_COLLREQ; + memcpy(&msg->collReq, req, reqLen); + + rasConnEnqueueMsg(conn, msg, msgLen); + + return ncclSuccess; +} + +// Handles the RAS_MSG_COLLREQ collective message request on the receiver side. Primarily deals with duplicates and +// re-broadcasts the message to local peers, though in case of a very limited RAS network it might be done right away, +// in which case it can immediately send the response. +ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { + bool allDone = false; + int collIdx = -1; + assert(sock->connIdx != -1); + + // First check if we've already handled this request (through another connection). + for (int i = 0; i < nRasCollHistory; i++) { + // In principle we can use i to index the array but we convert it so that we check the most recent entries first. + int collHistIdx = (rasCollHistNextIdx + COLL_HISTORY_SIZE - 1 - i) % COLL_HISTORY_SIZE; + if (memcmp(&msg->collReq.rootAddr, &rasCollHistory[collHistIdx].rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + msg->collReq.rootId == rasCollHistory[collHistIdx].rootId) { + if (msg->collReq.type >= RAS_COLL_CONNS) { + // Send an empty response so that the sender can account for it. The non-empty response has already been + // sent through the connection that we received the request through first. + NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); + } + goto exit; + } + } // for (i) + + if (msg->collReq.type >= RAS_COLL_CONNS) { + // Check if we're currently handling this collective request. + for (int i = 0; i < nRasCollectives; i++) { + struct rasCollective* coll = rasCollectives+i; + if (coll->type != RAS_MSG_NONE && + memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + msg->collReq.rootId == coll->rootId) { + assert(msg->collReq.type == coll->type); + + // Send an empty response so that the sender can account for it. The non-empty response will be + // sent through the connection that we received the request through first. + NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); + goto exit; + } // if match + } // for (i) + } // if (msg->collReq.type >= RAS_COLL_CONNS) + + // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. + NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + + if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { + assert(collIdx != -1); + // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total + // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer + // has more than one connection so there should always be _some_ other peer to forward the request to. + NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + } +exit: + return ncclSuccess; +} + +// Sends a collective response back to the process we received the collective request from. 
+// Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't +// any peers (unlikely), the peers sent their responses (likely), or we timed out). +static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { + if (coll->fromConnIdx != -1) { + // For remotely-initiated collectives, send the response back. + NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); + + // Add the identifying info to the collective message history. + nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE); + memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &coll->rootAddr, + sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr)); + rasCollHistory[rasCollHistNextIdx].rootId = coll->rootId; + rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE; + + rasCollFree(coll); + } else { + // For locally-initiated collectives, invoke the client code again (which will release it, once finished). + NCCLCHECK(rasClientResume(coll)); + } + return ncclSuccess; +} + +// Sends a collective response via the connection we originally received the request from. The message should be +// a cumulative response from this process and all the processes that we forwarded the request to. +static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, + const union ncclSocketAddress* rootAddr, uint64_t rootId, + const union ncclSocketAddress* peers, int nPeers, + const char* data, int nData, int nLegTimeouts) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_COLLRESP) + nPeers*sizeof(*peers); + int dataOffset = 0; + + if (nData > 0) { + ALIGN_SIZE(msgLen, alignof(int64_t)); + dataOffset = msgLen; + msgLen += nData; + } + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_COLLRESP; + memcpy(&msg->collResp.rootAddr, rootAddr, sizeof(msg->collResp.rootAddr)); + msg->collResp.rootId = rootId; + msg->collResp.nLegTimeouts = nLegTimeouts; + msg->collResp.nPeers = nPeers; + msg->collResp.nData = nData; + if (nPeers) + memcpy(msg->collResp.peers, peers, nPeers*sizeof(*msg->collResp.peers)); + if (nData) + memcpy(((char*)msg)+dataOffset, data, nData); + + rasConnEnqueueMsg(conn, msg, msgLen); + + return ncclSuccess; +} + +// Handles the collective response on the receiver side. Finds the corresponding rasCollective structure, merges +// the data from the response into the accumulated data. If all the responses have been accounted for, sends the +// accumulated response back. +ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { + int collIdx; + struct rasCollective* coll = nullptr; + char line[SOCKET_NAME_MAXLEN+1]; + + for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { + coll = rasCollectives+collIdx; + if (coll->type != RAS_MSG_NONE && + memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + msg->collResp.rootId == coll->rootId) + break; + } + if (collIdx == nRasCollectives) { + INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", + ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, + ncclSocketToString(&sock->sock.addr, rasLine)); + goto exit; + } + + coll->nLegTimeouts += msg->collResp.nLegTimeouts; + assert(sock->connIdx != -1); + // Account for the received response in our collective operation tracking.
+ for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == sock->connIdx) { + coll->fwdConns[i] = -1; + break; + } + } + coll->nFwdRecv++; + if (msg->collResp.nData > 0) { + // Collective-specific merging of the response into locally accumulated data. + if (coll->type == RAS_COLL_CONNS) + NCCLCHECK(rasCollConnsMerge(coll, msg)); + else if (coll->type == RAS_COLL_COMMS) + NCCLCHECK(rasCollCommsMerge(coll, msg)); + } + // We merge the peers after merging the data, so that the data merge function can rely on peers being unchanged. + if (msg->collResp.nPeers > 0) { + NCCLCHECK(ncclRealloc(&coll->peers, coll->nPeers, coll->nPeers + msg->collResp.nPeers)); + memcpy(coll->peers+coll->nPeers, msg->collResp.peers, msg->collResp.nPeers * sizeof(*coll->peers)); + coll->nPeers += msg->collResp.nPeers; + } + + // If we received all the data we were waiting for, send our response back. + if (coll->nFwdSent == coll->nFwdRecv) + NCCLCHECK(rasCollReadyResp(coll)); +exit: + return ncclSuccess; +} + +// Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being +// terminated. +void rasCollsPurgeConn(int connIdx) { + for (int i = 0; i < nRasCollectives; i++) { + struct rasCollective* coll = rasCollectives+i; + if (coll->type != RAS_MSG_NONE) { + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConnIdx == connIdx) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + rasCollFree(coll); + } else { + for (int j = 0; j < coll->nFwdSent; j++) { + if (coll->fwdConns[j] == connIdx) { + coll->fwdConns[j] = -1; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (j) + } // coll->fromConnIdx != connIdx + } // !RAS_MSG_NONE + } // for (i) +} + +// Frees a rasCollective entry and any memory associated with it. +void rasCollFree(struct rasCollective* coll) { + free(coll->fwdConns); + coll->fwdConns = nullptr; + free(coll->peers); + coll->peers = nullptr; + free(coll->data); + coll->data = nullptr; + coll->fromConnIdx = -1; + coll->type = RAS_MSG_NONE; +} + +// Invoked from the main RAS thread loop to handle timeouts of the collectives. +// We obviously want to have a reasonable *total* timeout that the RAS client can rely on, but we don't have strict +// global coordination. So we have, in effect, two timeouts: soft (5s) and hard (10s). Soft equals the keep-alive +// timeout. +// When sending collective requests, we skip any connections that are experiencing delays. After the 5s timeout, we +// check again the status of all outstanding connections and if any is now delayed, we give up on it. +// That works fine for directly observable delays, but if the problematic connection is further away from us, all +// we can do is trust that the other peers will "do the right thing soon". However, if there is a cascade of +// problematic connections, they could still exceed the 5s total. So after 10s we give up waiting no matter what +// and send back whatever we have. 
Unfortunately, the peer that the RAS client is connected to will in all likelihood +// time out first, so at that point any delayed responses that eventually arrive are likely to be too late... +void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { + for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { + struct rasCollective* coll = rasCollectives+collIdx; + if (coll->type == RAS_MSG_NONE || coll->timeout == 0) + continue; + + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] != -1) { + struct rasConnection* conn = rasConns+coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = -1; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i] != -1) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { + (void)rasCollReadyResp(coll); + } else { + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). + INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // conn->nFwdRecv < conn->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // for (collIdx) +} + + +///////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the RAS_COLL_CONNS collective. 
// +///////////////////////////////////////////////////////////////////////// + +// Initializes the accumulated data with just the local data for now. +// For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well +// as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, +// but the system clocks may not be perfectly in sync). +static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { + struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; + struct rasCollConns* pConnsData; + + // Update the statistical data first and in the process also calculate how much connection-specific space we + // will need. + for (int i = 0; i < nRasConns; i++) { + struct rasConnection* conn = rasConns+i; + if (conn->inUse && conn->travelTimeCount > 0) { + if (connsData.travelTimeMin > conn->travelTimeMin) + connsData.travelTimeMin = conn->travelTimeMin; + if (connsData.travelTimeMax < conn->travelTimeMax) + connsData.travelTimeMax = conn->travelTimeMax; + connsData.travelTimeSum += conn->travelTimeSum; + connsData.travelTimeCount += conn->travelTimeCount; + connsData.nConns++; + if (conn->travelTimeMin < 0) + connsData.nNegativeMins++; + } + } + + *pNData = sizeof(connsData) + connsData.nNegativeMins*sizeof(*connsData.negativeMins); + NCCLCHECK(ncclCalloc(pData, *pNData)); + pConnsData = (struct rasCollConns*)*pData; + memcpy(pConnsData, &connsData, sizeof(*pConnsData)); + if (connsData.nNegativeMins > 0) { + for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { + struct rasConnection* conn = rasConns+i; + if (conn->inUse && conn->travelTimeMin < 0) { + struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; + memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); + memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); + negativeMin->travelTimeMin = conn->travelTimeMin; + negMinsIdx++; + } + assert(negMinsIdx <= connsData.nNegativeMins); + } + } + + return ncclSuccess; +} + +// Merges incoming collective RAS_COLL_CONNS response message into the local accumulated data. +static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg) { + struct rasCollConns* collData; + struct rasCollConns* msgData; + int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); + ALIGN_SIZE(dataOffset, alignof(int64_t)); + + msgData = (struct rasCollConns*)(((char*)msg) + dataOffset); + collData = (struct rasCollConns*)coll->data; + + // Merge the stats. + if (collData->travelTimeMin > msgData->travelTimeMin) + collData->travelTimeMin = msgData->travelTimeMin; + if (collData->travelTimeMax < msgData->travelTimeMax) + collData->travelTimeMax = msgData->travelTimeMax; + collData->travelTimeSum += msgData->travelTimeSum; + collData->travelTimeCount += msgData->travelTimeCount; + collData->nConns += msgData->nConns; + + // Append the info about negative minimums. 
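+ // E.g., if 2 negativeMins entries have been accumulated so far and the response carries 3 more, the buffer is + // reallocated to sizeof(*collData) + 5*sizeof(*collData->negativeMins) bytes and the 3 incoming entries are + // copied right behind the existing ones (coll->data+coll->nData marks the end of the old buffer).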
+ if (msgData->nNegativeMins > 0) { + int nData = sizeof(*collData) + + (collData->nNegativeMins+msgData->nNegativeMins) * sizeof(*collData->negativeMins); + NCCLCHECK(ncclRealloc(&coll->data, coll->nData, nData)); + collData = (struct rasCollConns*)coll->data; + memcpy(coll->data+coll->nData, msgData->negativeMins, + msgData->nNegativeMins * sizeof(*collData->negativeMins)); + coll->nData = nData; + collData->nNegativeMins += msgData->nNegativeMins; + } + + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the RAS_COLL_COMMS collective. // +///////////////////////////////////////////////////////////////////////// + +// Initializes the accumulated data with just the local data for now. +// For this particular collective, we keep for every communicator information about every rank, to help identify +// the missing ones and the discrepancies between the ones that did respond. +static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { + struct rasCollComms* commsData; + int nComms = 0, nRanks = 0; + std::lock_guard lock(ncclCommsMutex); + + // Start by counting the communicators so that we know how much space to allocate. + // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case + // of multiple GPUs per process) and between the peers. + if (!ncclCommsSorted) { + qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); + ncclCommsSorted = true; + } + for (int i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + break; + if (i == 0) { + nComms = 1; + } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + nComms++; + } + nRanks++; + } + + // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + // pointer manipulations somewhat unwieldy... + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); + NCCLCHECK(ncclCalloc(pData, *pNData)); + commsData = (struct rasCollComms*)*pData; + commsData->nComms = nComms; + + // comm points at the space in the accumulated data where the info about the current communicator is to be stored. + struct rasCollComms::comm* comm = commsData->comms; + for (int i = 0; i < nNcclComms; i++) { + struct rasCollComms::comm::rank* rank; + ncclResult_t asyncError; + if (ncclComms[i] == nullptr) + break; + if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + if (i > 0) + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); + comm->commHash = ncclComms[i]->commHash; + comm->commNRanks = ncclComms[i]->nRanks; + comm->nRanks = 0; + } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { + INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " + "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); + continue; // Short of failing, the best we can do is skip... + } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", + ncclComms[i]->rank, comm->commHash); + continue; // Short of failing, the best we can do is skip... 
+ } + if (comm->nRanks == comm->commNRanks) { + INFO(NCCL_RAS, + "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", + comm->commNRanks, comm->commHash); + continue; // Short of failing, the best we can do is skip... + } + rank = comm->ranks+comm->nRanks; + rank->commRank = ncclComms[i]->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + rank->collOpCount = ncclComms[i]->collOpCount; + rank->status.initState = ncclComms[i]->initState; + if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComms[i]->cudaDev; + rank->nvmlDev = ncclComms[i]->nvmlDev; + comm->nRanks++; + } + assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + + return ncclSuccess; +} + +// Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. +static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { + struct rasCollComms* collData; + struct rasCollComms* msgData; + int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); + ALIGN_SIZE(dataOffset, alignof(int64_t)); + + msgData = (struct rasCollComms*)(((char*)msg) + dataOffset); + collData = (struct rasCollComms*)coll->data; + + if (msgData->nComms > 0) { + struct rasCollComms* newData = nullptr; + + // Allocate the new buffer pessimistically (sized as the sum of the two old ones). + NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); + struct rasCollComms::comm* collComm = collData->comms; + struct rasCollComms::comm* msgComm = msgData->comms; + struct rasCollComms::comm* newComm = newData->comms; + + for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { + int cmp; + if (collIdx < collData->nComms && msgIdx < msgData->nComms) + cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + else + cmp = (collIdx < collData->nComms ? -1 : 1); + + if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { + INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " + "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); + // We try to preserve both separately, although the input data might already be messed up anyway... + } + + if (cmp == 0) { + // Merge the comms. + newComm->commHash = collComm->commHash; + newComm->commNRanks = collComm->commNRanks; + if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { + INFO(NCCL_RAS, + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", + collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + // We'll skip the extras in the loop below. + } else { + newComm->nRanks = collComm->nRanks + msgComm->nRanks; + } + // Merge the ranks. 
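+ // The rank arrays are kept sorted by commRank (see rasCollCommsInit), so this is a plain two-way merge; + // e.g., local ranks {0,2} and incoming ranks {1,3} interleave into {0,1,2,3}. Entries taken from the message + // also get their peerIdx shifted by coll->nPeers, since the message's peers array is appended after ours + // (in rasMsgHandleCollResp).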
+ for (int newRankIdx = 0, collRankIdx = 0, msgRankIdx = 0; + collRankIdx < collComm->nRanks || msgRankIdx < msgComm->nRanks; + newRankIdx++) { + int cmpRank; + if (newRankIdx == newComm->commNRanks) + break; // Short of failing, the best we can do is skip... + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : + (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); + else + cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + + // There shouldn't be any overlaps in ranks between different sources. + if (cmpRank == 0) { + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", + collComm->ranks[collRankIdx].commRank, newComm->commHash); + msgRankIdx++; // Short of failing, the best we can do is skip... + } + memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : + msgComm->ranks+msgRankIdx++), sizeof(*newComm->ranks)); + if (cmpRank > 0) { + // peerIdx values from msgComm need to shift after merge. + newComm->ranks[newRankIdx].peerIdx += coll->nPeers; + } + } // for (newRankIdx) + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + collIdx++; + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgIdx++; + } else if (cmp < 0) { + // Copy from collComm. + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + memcpy(newComm, collComm, commSize); + newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); + collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); + collIdx++; + } else { // cmp > 0 + // Copy from msgComm. + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + memcpy(newComm, msgComm, commSize); + for (int i = 0; i < newComm->nRanks; i++) { + // peerIdx values from msgComm need to shift after merge. + newComm->ranks[i].peerIdx += coll->nPeers; + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm)) + commSize); + msgIdx++; + } // cmp > 0 + } // for (collIdx and msgIdx) + + free(coll->data); + coll->data = (char*)newData; + // newComm points at the next element beyond the last one -- exactly what we need. + coll->nData = ((char*)newComm) - (char*)newData; + } // if (msgData->nComms > 0) + + return ncclSuccess; +} + +// Sorting callback for the ncclComms array. +static int ncclCommsCompare(const void* p1, const void* p2) { + const ncclComm** pc1 = (const ncclComm**)p1; + const ncclComm** pc2 = (const ncclComm**)p2; + + // Put nullptr's at the end. + if (*pc1 == nullptr || *pc2 == nullptr) + return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + + if ((*pc1)->commHash == (*pc2)->commHash) { + return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + } else { + return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + } +} diff --git a/src/ras/peers.cc b/src/ras/peers.cc new file mode 100644 index 000000000..f2692d3e1 --- /dev/null +++ b/src/ras/peers.cc @@ -0,0 +1,960 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out during development only! +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "ras_internal.h" + + +// All the known peer NCCL processes. The array is sorted by addr to ensure locality (within a node and hopefully +// also within a DC). The array may grow over time and it *includes* dead peers. +struct rasPeerInfo* rasPeers; +int nRasPeers; +// Hash of the rasPeers array, for figuring out when to sync with a remote peer. +uint64_t rasPeersHash; +// Index of this process within the rasPeers array (may change over time as the array grows). +static int myPeerIdx = -1; + +// Addresses of all the dead peers, sorted. In principle we could instead have a flag in rasPeerInfo for this, +// but we expect rasPeers to be largely static (and large at scale!) and rasDeadPeers to be fairly dynamic and +// much smaller, so we prefer to keep the dead info separately so that we don't end up sending the possibly large +// rasPeerInfo array around all the time. +union ncclSocketAddress* rasDeadPeers; +// The number of dead peers. +int nRasDeadPeers; +// The array size (may be larger than nRasDeadPeers). +static int rasDeadPeersSize; +// Hash of the rasDeadPeers array, for figuring out when to sync with a remote peer. +uint64_t rasDeadPeersHash; + +static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks, + struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers); +static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); + +static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, + struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); +static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, + bool updateDeadPeers, struct rasRankInit* ranks, int nranks, + int fromConnIdx); +static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, + int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); +ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); + +static ncclResult_t rasLinkReinitConns(struct rasLink* link); + +static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers); +static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr); + +static int rasAddrRankInitCompare(const void* k, const void* e); +static int rasAddrPeerInfoCompare(const void* k, const void* e); +static int rasRanksCompare(const void* e1, const void* e2); + +static void rasPeersDump(); +static void rasDeadPeersDump(); +static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres); + + +///////////////////////////////////////////////////////////////////////////// +// Functions related to the handling of local RAS_ADD_RANKS notifications. // +///////////////////////////////////////////////////////////////////////////// + +// Handles RAS_ADD_RANKS notification -- adds new ranks to the internal list of all RAS peers, reconfigures RAS +// network connections, and notifies the peers. 
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks) { + ncclResult_t ret = ncclSuccess; + + INFO(NCCL_RAS, "RAS handling local addRanks request (old nRasPeers %d)", nRasPeers); + + // Convert the input rasRankInit structures into our internal rasPeerInfo. + struct rasPeerInfo* rankPeers = nullptr; + int nRankPeers; + int newNRasPeers; + NCCLCHECKGOTO(rasRanksConvertToPeers(ranks, nranks, &rankPeers, &nRankPeers, &newNRasPeers), ret, fail); + + // Update local rasPeers. + NCCLCHECKGOTO(rasPeersUpdate(rankPeers, &nRankPeers, newNRasPeers), ret, fail); + + INFO(NCCL_RAS, "RAS finished local processing of addRanks request (new nRasPeers %d, nRankPeers %d)", + nRasPeers, nRankPeers); + // Print peers only if something changed and we're the "root". + if (nRankPeers > 0 && memcmp(&ranks[0].addr, &rasNetListeningSocket.addr, sizeof(ranks[0].addr)) == 0) + rasPeersDump(); + + // Propagate the changes through our RAS network links. + NCCLCHECKGOTO(rasNetUpdatePeers(rankPeers, nRankPeers, /*updateDeadPeers*/false, ranks, nranks), ret, fail); + +exit: + if (rankPeers) + free(rankPeers); + free(ranks); + return ret; +fail: + goto exit; +} + +// Converts the rasRankInit structure into rasPeerInfo. This skips empty elements (in case of errors), orders +// elements by the address/cudaDev, and merges elements with duplicate addresses (in case of multiple CUDA devices per +// process). In the process we also calculate how large the merged rasPeers array will need to be. +static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks, + struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers) { + ncclResult_t ret = ncclSuccess; + int peerIdx, rankPeerIdx; + + // Handy when checking for empty (in case of errors) addresses. + union ncclSocketAddress emptyAddr; + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + + // Begin by sorting the array by address and cudaDev (to match the rasPeers order). + qsort(ranks, nranks, sizeof(*ranks), &rasRanksCompare); + + // We over-allocate peers here because to get an accurate count we would need to loop over the ranks first... + // nRankPeers will hold the actual count of used elements. + *rankPeers = nullptr; + NCCLCHECKGOTO(ncclCalloc(rankPeers, nranks), ret, fail); + + peerIdx = rankPeerIdx = 0; + *newNRasPeers = nRasPeers; + for (int rankIdx = 0; rankIdx < nranks; rankIdx++) { + const struct rasRankInit* rank = ranks+rankIdx; + struct rasPeerInfo* rankPeer = *rankPeers+rankPeerIdx; + + if (memcmp(&emptyAddr, &rank->addr, sizeof(emptyAddr)) == 0) { + // Skip empty rank entries. + continue; + } + + // First check if the rank doesn't need to be merged into the previous entry in rankPeers + // (possible if there are multiple ranks with the same address). + if (rankPeerIdx > 0 && memcmp(&rank->addr, &rankPeer[-1].addr, sizeof(rank->addr)) == 0) { + // Merge into the previous entry in peers. + rankPeer[-1].cudaDevs |= (1UL << rank->cudaDev); + rankPeer[-1].nvmlDevs |= (1UL << rank->nvmlDev); + continue; + } + + // Add a new entry to rankPeers. + assert(rankPeerIdx < nranks); + memcpy(&rankPeer->addr, &rank->addr, sizeof(rankPeer->addr)); + rankPeer->pid = rank->pid; + rankPeer->cudaDevs = (1UL << rank->cudaDev); + rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeerIdx++; + + // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how + // many more entries will be needed. 
+ const struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + int cmp = 0; + while (peerIdx < nRasPeers) { + cmp = ncclSocketsCompare(&rank->addr, &rasPeer->addr); + if (cmp <= 0) + break; + peerIdx++; + rasPeer++; + } + if (peerIdx == nRasPeers) { + // The current rank is "greater than" all existing peers, so it will need a new entry. We stay in the loop so + // that we don't need to handle the remaining ranks separately. + (*newNRasPeers)++; + continue; + } + if (cmp < 0) { + (*newNRasPeers)++; + } else { + // Duplicates (cmp == 0) between the rank array and the peers array will be merged. + assert(rank->pid == rasPeer->pid); + } + } + assert(peerIdx <= nRasPeers); + *nRankPeers = rankPeerIdx; + +exit: + return ret; +fail: + if (*rankPeers) { + free(*rankPeers); + *rankPeers = nullptr; + } + goto exit; +} + +// Updates the rasPeers array with the new data. The new data gets updated in the process as well: any data that +// wasn't actually new is purged, so as to minimize the amount of data we forward to our peers. +// On a successful return, nRankPeers contains the number of entries that were updated. +static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers) { + ncclResult_t ret = ncclSuccess; + int rankPeerIdxDst; + int rankPeerIdx, peerIdx; + + if (newNRasPeers == -1) { + // First calculate the new size of rasPeers. + newNRasPeers = nRasPeers; + for (rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + int cmp = 1; + + while (peerIdx < nRasPeers) { + cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr); + + if (cmp < 0) { + // rankPeer will go in front of rasPeer. + newNRasPeers++; + break; + } + + peerIdx++; + rasPeer++; + + if (cmp == 0) + break; + } + if (cmp > 0) // No more rasPeer entries -- rankPeer will go at the end. + newNRasPeers++; + } + } + + // If needed, allocate a new, larger rasPeers array. + struct rasPeerInfo* newRasPeers; + int myNewPeerIdx; + if (newNRasPeers > nRasPeers) { + NCCLCHECKGOTO(ncclCalloc(&newRasPeers, newNRasPeers), ret, fail); + } else { + newRasPeers = rasPeers; + } + + // Now merge the rankPeers into newRasPeers. In the process, modify rankPeers to become a "diff" between + // the old rasPeers and newRasPeers -- this will be the data structure to broadcast on the RAS network. + myNewPeerIdx = -1; + int newPeerIdx; + for (newPeerIdx = rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers || peerIdx < nRasPeers;) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + struct rasPeerInfo* newRasPeer = newRasPeers+newPeerIdx; + + if (rankPeerIdx < *nRankPeers) { + if (peerIdx < nRasPeers) { + int cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr); + + if (cmp < 0) { + // rankPeer needs to occur before rasPeer -- that's possible only if we are adding new entries. + assert(newRasPeers != rasPeers); + // Add new entry to newRasPeers. + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer)); + newPeerIdx++; + rankPeerIdx++; + } + else { + // cmp >= 0 -- Start by copying peer to newRasPeer, if needed. + if (newRasPeers != rasPeers) { + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer)); + } + else { // in-place + assert(newRasPeer == rasPeer); + } + + if (cmp == 0) { + // The address of rankPeer is the same as that of newRasPeer -- merge into it. 
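+ // (For illustration: newRasPeer->cudaDevs of 0b0011 merged with rankPeer->cudaDevs of 0b0110 yields 0b0111, + // and only bit 2 (0b0100) counts as newly contributed.)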
+ // First though calculate what GPUs from rankPeer are actually new (if any). + uint64_t newDevs = rankPeer->cudaDevs & ~newRasPeer->cudaDevs; + newRasPeer->cudaDevs |= rankPeer->cudaDevs; + // Update rankPeer->cudaDevs with the newly added devs only -- we'll clean it up at the end. + rankPeer->cudaDevs = newDevs; + // Repeat for nvmlDevs... + newDevs = rankPeer->nvmlDevs & ~newRasPeer->nvmlDevs; + newRasPeer->nvmlDevs |= rankPeer->nvmlDevs; + rankPeer->nvmlDevs = newDevs; + rankPeerIdx++; + } + // Given that we might've added new entries, we need to update myPeerIdx as well. + if (myPeerIdx == peerIdx) + myNewPeerIdx = newPeerIdx; + peerIdx++; + newPeerIdx++; + } + } else { // peerIdx == nRasPeers + // No more rasPeers -- add a new entry based on rank. + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer)); + // If this is the first time this function is run, myPeerIdx will need to be set. It's more work in that + // case as we need to compare the addresses of each peer until we find one. + if (myPeerIdx == -1 && memcmp(&newRasPeer->addr, &rasNetListeningSocket.addr, sizeof(newRasPeer->addr)) == 0) + myNewPeerIdx = newPeerIdx; + newPeerIdx++; + rankPeerIdx++; + } + } else { // rankPeerIdx == *nRankPeers + // No more rankPeers -- copy the rasPeer over if needed. + if (newRasPeers != rasPeers) { + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer)); + } + else { // in-place at the end. + assert(newRasPeer == rasPeer); + } + if (myPeerIdx == peerIdx) + myNewPeerIdx = newPeerIdx; + peerIdx++; + newPeerIdx++; + } + } + assert(newPeerIdx == newNRasPeers); + + if (newRasPeers != rasPeers) { + if (rasPeers) + free(rasPeers); + rasPeers = newRasPeers; + nRasPeers = newNRasPeers; + assert(myNewPeerIdx != -1); + myPeerIdx = myNewPeerIdx; + } else { + assert(myNewPeerIdx == myPeerIdx); + } + rasPeersHash = getHash((const char*)rasPeers, nRasPeers*sizeof(*rasPeers)); + + // Purge from rankPeers all entries that didn't actually contribute any new GPUs. + for (rankPeerIdx = rankPeerIdxDst = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + if (rankPeer->cudaDevs != 0) { + if (rankPeerIdxDst != rankPeerIdx) { + memcpy(rankPeers+rankPeerIdxDst, rankPeer, sizeof(*rankPeers)); + } + rankPeerIdxDst++; + } + } + assert(rankPeerIdxDst <= *nRankPeers); + *nRankPeers = rankPeerIdxDst; + +exit: + return ret; +fail: + goto exit; +} + +// Searches through rasPeers given the peer address. Returns the index of the found entry in the rasPeers +// array or -1 if not found. +int rasPeerFind(const union ncclSocketAddress* addr) { + struct rasPeerInfo* peer = (struct rasPeerInfo*)bsearch(addr, rasPeers, nRasPeers, sizeof(*rasPeers), + rasAddrPeerInfoCompare); + return (peer ? peer-rasPeers : -1); +} + + +///////////////////////////////////////////////////////////////////////////////// +// Functions related to the propagation of peers updates over the RAS network. // +///////////////////////////////////////////////////////////////////////////////// + +// Propagates information about new peers through the RAS network links. +// ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members +// of the new communicator being established), and who thus don't need to be notified. updateDeadPeers can +// be used, however, to request at least the propagation of rasDeadPeers to such peers.
+// fromConnIdx -- if provided -- identifies the connection used to receive this update; there's no need to +// propagate the update back through it. +// Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new +// connections as needed. +static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, + struct rasRankInit* ranks, int nranks, int fromConnIdx) { + ncclResult_t ret = ncclSuccess; + + // Do we actually have anything to do? + if (nNewPeers == 0 && !updateDeadPeers) + goto exit; + + // Start by propagating the update through the RAS network links. We consider any errors during this process + // to be non-fatal (we can re-sync later around a keep-alive exchange). + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + + // Calculate new link peers and open new connections if needed. + NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); + NCCLCHECKGOTO(rasLinkReinitConns(&rasPrevLink), ret, fail); + +exit: + return ret; +fail: + goto exit; +} + +// Sends a peers update through all the connections associated with a particular link. See rasNetUpdatePeers +// for the explanation of the function arguments. +static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, + bool updateDeadPeers, struct rasRankInit* ranks, int nranks, + int fromConnIdx) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + // Note that we don't send the update via the connection that we received this notification from in the first + // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). + if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + // Failed propagations are not considered fatal (we will retry after a keep-alive). + (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + } + } + + return ncclSuccess; +} + +// Sends a peers update down a particular connection. See rasNetUpdatePeers for the explanation of the function +// arguments. +static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, + int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + // If we have the rank info, check if the peer on the other side of this connection has participated in the new + // communicator. + int connRank = -1; + if (ranks && !updateDeadPeers) { + struct rasRankInit* rank = (struct rasRankInit*)bsearch(&conn->addr, ranks, nranks, sizeof(*ranks), + rasAddrRankInitCompare); + if (rank) + connRank = rank-ranks; + } + if (connRank < 0) { + // It did not participate or we don't know -- we should send an update to that peer then. + NCCLCHECK(rasConnSendPeersUpdate(conn, newPeers, nNewPeers)); + } + } + + return ncclSuccess; +} + +// Sends a RAS_MSG_PEERSUPDATE message, which can include both the rasPeers (preferably only the newly added peers +// rather than the complete rasPeers array, to save on the network bandwidth) and rasDeadPeers (sent in its entirety +// if at all, as it's assumed to be a lot smaller than rasPeers).
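+// The resulting message layout (when both parts are included) is: the RAS_MSG_PEERSUPDATE header, then nPeers +// rasPeerInfo entries, then padding up to alignof(union ncclSocketAddress), then nDeadPeers socket addresses.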
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers) { + struct rasMsg* msg = nullptr; + int msgLen; + int deadPeersOffset = 0; + int nDeadPeers; + + if (conn->lastSentPeersHash == rasPeersHash || conn->lastRecvPeersHash == rasPeersHash) { + nPeers = 0; + } + if (conn->lastSentDeadPeersHash == rasDeadPeersHash || conn->lastRecvDeadPeersHash == rasDeadPeersHash) { + nDeadPeers = 0; + } else { + // We expect the rasDeadPeers array to be much smaller than rasPeers so if we send it, we send it in full. + nDeadPeers = nRasDeadPeers; + } + + if (nPeers == 0 && nDeadPeers == 0) + goto exit; + + msgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*peers); + if (nDeadPeers > 0) { + ALIGN_SIZE(msgLen, alignof(union ncclSocketAddress)); + deadPeersOffset = msgLen; + msgLen += nDeadPeers*sizeof(*rasDeadPeers); + } + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_PEERSUPDATE; + msg->peersUpdate.peersHash = rasPeersHash; + msg->peersUpdate.nPeers = nPeers; + msg->peersUpdate.deadPeersHash = rasDeadPeersHash; + msg->peersUpdate.nDeadPeers = nDeadPeers; + memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + + if (nPeers > 0) + conn->lastSentPeersHash = rasPeersHash; + if (nDeadPeers > 0) + conn->lastSentDeadPeersHash = rasDeadPeersHash; + + INFO(NCCL_RAS, "RAS sending a peersUpdate to %s (nPeers %d, nDeadPeers %d)", + ncclSocketToString(&conn->addr, rasLine), nPeers, nDeadPeers); + + rasConnEnqueueMsg(conn, msg, msgLen); +exit: + return ncclSuccess; +} + +// Handles the RAS_MSG_PEERSUPDATE message on the receiver side. The received data is merged into the local +// rasPeers and rasDeadPeers arrays. If the checksums of the resulting arrays don't match those from the message, +// sends its own RAS_MSG_PEERSUPDATE back to the source, to ensure a sync. +// Subsequently propagates the update to its own peers. +ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) { + ncclResult_t ret = ncclSuccess; + struct rasMsg* newMsg = nullptr; + int newMsgLen = 0; + assert(sock->connIdx != -1); + struct rasConnection* conn = rasConns+sock->connIdx; + int nPeers, nDeadPeers; + int deadPeersOffset = 0; + bool updatePeers, updateDeadPeers; + + INFO(NCCL_RAS, "RAS handling peersUpdate from %s (peersHash 0x%lx, deadPeersHash 0x%lx, nPeers %d, nDeadPeers %d)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->peersUpdate.peersHash, msg->peersUpdate.deadPeersHash, + msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); + INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", + rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); + conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + + // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need + // to send it. We'll find out by comparing the hash values after the merge. + // We want to prepare the message pre-merge though because post-merge it will include the just received new peers, + // and it's pointless to send those back to where they just came from. + // nPeers and nDeadPeers are used primarily for message length calculations, so they have to assume the worst-case + // scenario (e.g., no overlap in case of nDeadPeers). + nPeers = (msg->peersUpdate.peersHash != rasPeersHash ? 
nRasPeers : 0); + nDeadPeers = (msg->peersUpdate.deadPeersHash != rasDeadPeersHash ? nRasDeadPeers+msg->peersUpdate.nDeadPeers : 0); + if (nPeers > 0 || nDeadPeers > 0) { + newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*rasPeers); + if (nDeadPeers > 0) { + ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); + newMsgLen += nDeadPeers*sizeof(*rasDeadPeers); + } + NCCLCHECKGOTO(rasMsgAlloc(&newMsg, newMsgLen), ret, fail); + newMsg->type = RAS_MSG_PEERSUPDATE; + // Note that after rasPeersUpdate below we may still decide not to send the peers. + memcpy(newMsg->peersUpdate.peers, rasPeers, nPeers * sizeof(newMsg->peersUpdate.peers[0])); + newMsg->peersUpdate.nPeers = nPeers; + + if (nDeadPeers > 0) { + // Calculate the offset where dead peers are stored in the received message. We do it before the peers + // update because it could modify msg->peersUpdate.nPeers... + deadPeersOffset = rasMsgLength(RAS_MSG_PEERSUPDATE) + msg->peersUpdate.nPeers * sizeof(msg->peersUpdate.peers[0]); + ALIGN_SIZE(deadPeersOffset, alignof(union ncclSocketAddress)); + } + + if (nPeers > 0) + NCCLCHECKGOTO(rasPeersUpdate(msg->peersUpdate.peers, &msg->peersUpdate.nPeers), ret, fail); + else + msg->peersUpdate.nPeers = 0; + if (nDeadPeers > 0) + NCCLCHECKGOTO(rasDeadPeersUpdate((union ncclSocketAddress*)(((char*)msg)+deadPeersOffset), + &msg->peersUpdate.nDeadPeers), ret, fail); + else + msg->peersUpdate.nDeadPeers = 0; + + INFO(NCCL_RAS, "RAS finished local processing of peersUpdate " + "(new nRasPeers %d, nRasDeadPeers %d, nPeers %d, nDeadPeers %d)", + nRasPeers, nRasDeadPeers, msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); + if (msg->peersUpdate.nPeers > 0) + rasPeersDump(); + if (msg->peersUpdate.nDeadPeers > 0) + rasDeadPeersDump(); + + // If post-merge the hashes are still different, send our (dead) peers back. + updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && + conn->lastRecvDeadPeersHash != rasDeadPeersHash); + if (updatePeers || updateDeadPeers) { + newMsg->peersUpdate.peersHash = rasPeersHash; + newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; + if (updatePeers) { + assert(nPeers > 0); + conn->lastSentPeersHash = rasPeersHash; + } else { + // If hashes match, make sure that we don't send the rasPeers back. + newMsg->peersUpdate.nPeers = 0; + } + + // We need to recalculate the message size from scratch now that both rasPeers and rasDeadPeers may have changed. + newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + newMsg->peersUpdate.nPeers * sizeof(*rasPeers); + + if (updateDeadPeers) { + assert(nRasDeadPeers > 0); + conn->lastSentDeadPeersHash = rasDeadPeersHash; + + ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); + deadPeersOffset = newMsgLen; + newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); + + memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + conn->lastSentDeadPeersHash = rasDeadPeersHash; + newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; + } else { + newMsg->peersUpdate.nDeadPeers = 0; + } + + INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", + newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); + + rasConnEnqueueMsg(conn, newMsg, newMsgLen); + newMsg = nullptr; + } // if (updatePeers || updateDeadPeers) + + // Propagate the changes through our RAS network links. 
+ NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, + sock->connIdx), ret, fail); + } + +exit: + rasMsgFree(newMsg); + return ret; +fail: + goto exit; +} + + +////////////////////////////////////////////////////////////////////////////////////////// +// Functions related to the (re-)configuration of RAS connections after a peers update. // +////////////////////////////////////////////////////////////////////////////////////////// + +// Reinitializes the connection(s) of a particular link, following a peers update. +// Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// structures, so it's better to drop it all and recalculate from scratch. +// We recalculate the primary peer; if an active connection to it already exists, then we're done. If there +// is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and +// the process repeats. +// External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). +static ncclResult_t rasLinkReinitConns(struct rasLink* link) { + struct rasLinkConn* linkConn; + struct rasConnection* conn = nullptr; + int newPeerIdx = myPeerIdx; + + if (link->connsSize == 0) { + link->connsSize = RAS_INCREMENT; + NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); + } + link->nConns = 0; + + // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. + while (newPeerIdx != -1) { + if (link->nConns == link->connsSize) { + NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); + link->connsSize += RAS_INCREMENT; + } + + newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); + if (newPeerIdx == -1) { + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); + if (link->nConns > 0) + break; + } + linkConn = link->conns+link->nConns; + linkConn->peerIdx = newPeerIdx; + linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); + linkConn->external = false; + + // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. + // Depending on the circumstances, we may first need to create that connection. + if (linkConn->connIdx == - 1) { + if (link->nConns == 0) { + if (linkConn->peerIdx != -1) { + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); + } + else { // If we didn't initiate the connection, start the timeout. 
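+            // lastUpdatePeersTime arms a deadline for the lower-address peer to connect to us; if it never
+            // does, the periodic timeout handling (presumably rasNetHandleTimeouts()) can open the connection
+            // from this side after all.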
+ link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // link->nConns > 0 + INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", + link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); + } // link->nConns > 0 + } else { // linkConn->connIdx != -1 + if (link->nConns == 0) { + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } else { + INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", + link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } + } + link->nConns++; + if (linkConn->connIdx == -1) + break; + conn = rasConns+linkConn->connIdx; + + // We check if the connection already went through the fallback calculation; if so, we'll need to create a new + // fallback in the next iteration, to ensure that RAS will keep retrying. + if (!conn->experiencingDelays) + break; + + INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", + conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + } + + return ncclSuccess; +} + +// Calculates the index of the peer on the RAS network. Can also be used to calculate the index of the next fallback +// peer. +// In the simplest case we want to try the "next closest" fallback, although we still need to check for and skip +// any dead peers. +// For fallbacks to fallbacks, we also apply a more pessimistic policy. We skip all the remaining RAS threads that +// are on the same node as the previous fallback (unless it's the same node that we're running on or we have strong +// indications that the node is up). We do that to avoid having to excessively wait iterating through, say, 8 +// processes when a whole node might be down. +int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback) { + int newPeerIdx = (peerIdx + link->direction + nRasPeers) % nRasPeers; + do { + if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { + // peerIdx is a fallback and it is not running on the same node as us. + int tryPeerIdx = newPeerIdx; + int tryConnIdx = -1; + + // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that + // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a + // little suboptimal one. + while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { + if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { + tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConnIdx != -1) { + struct rasConnection* tryConn = rasConns+tryConnIdx; + // Check if the connection is fully established and operational, i.e., if the underlying socket + // is ready and there's been recent communication on it. + if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && + !tryConn->experiencingDelays) { + // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in + // this case. This is the only case when tryConnIdx != -1 after this loop. 
+ break; + } + } // if (tryConnIdx != -1) + } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) + + tryConnIdx = -1; + tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + if (tryPeerIdx == myPeerIdx) + break; + } + + if (tryConnIdx == -1) + newPeerIdx = tryPeerIdx; + if (tryPeerIdx == myPeerIdx) + break; + } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) + + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { + newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; + } + else + break; + } while (newPeerIdx != myPeerIdx); + + return (newPeerIdx != myPeerIdx ? newPeerIdx : -1); +} + + +////////////////////////////////////////////////////// +// Functions related to the handling of dead peers. // +////////////////////////////////////////////////////// + +// Marks a peer as dead in the local rasDeadPeers array. Any propagation, reconfiguration, etc., needs to be +// handled outside of this function. +ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr) { + union ncclSocketAddress* deadAddr; + + if (!rasPeerIsDead(addr)) { + NCCLCHECK(getNewDeadEntry(&deadAddr)); + memcpy(deadAddr, addr, sizeof(*deadAddr)); + qsort(rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), &ncclSocketsCompare); + + rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + + INFO(NCCL_RAS, "RAS declaring peer %s as DEAD; rasDeadPeersHash 0x%lx", + ncclSocketToString(addr, rasLine), rasDeadPeersHash); + } + return ncclSuccess; +} + +// Invoked when an incoming RAS_MSG_PEERSUPDATE includes info on dead peers. Updates the rasDeadPeers array. +// Any propagation needs to be handled outside of this function, though it *does* disconnect any connections +// with the newly dead peers. +// On return, nUpdatePeers contains the number of newly added dead entries. +static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers) { + static union ncclSocketAddress* newPeers = nullptr; + static union ncclSocketAddress* oldPeers; + + if (*nUpdatePeers == 0) + return ncclSuccess; + + // Pessimistically estimate the new size of rasDeadPeers. + int nNewPeers = nRasDeadPeers + *nUpdatePeers; + if (nNewPeers > rasDeadPeersSize) { + nNewPeers = ROUNDUP(nNewPeers, RAS_INCREMENT); + + NCCLCHECK(ncclCalloc(&newPeers, nNewPeers)); + oldPeers = rasDeadPeers; + } else { + // We don't need to allocate a new array in this case. We just shift the existing content to the end of the + // array to make room in the front for merging. + oldPeers = rasDeadPeers+(rasDeadPeersSize-nRasDeadPeers); + memmove(oldPeers, rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + newPeers = rasDeadPeers; + } + + // Merge updatePeers with oldPeers into newPeers. + int oldPeersIdx, updatePeersIdx, newPeersIdx; + for (oldPeersIdx = updatePeersIdx = newPeersIdx = 0; oldPeersIdx < nRasDeadPeers || updatePeersIdx < *nUpdatePeers;) { + int cmp; + if (oldPeersIdx < nRasDeadPeers && updatePeersIdx < *nUpdatePeers) { + cmp = ncclSocketsCompare(oldPeers+oldPeersIdx, updatePeers+updatePeersIdx); + } else { + cmp = (oldPeersIdx < nRasDeadPeers ? -1 : 1); + } + + memmove(newPeers+newPeersIdx++, (cmp <= 0 ? 
oldPeers+oldPeersIdx : updatePeers+updatePeersIdx), sizeof(*newPeers)); + if (cmp <= 0) + oldPeersIdx++; + if (cmp > 0) { + rasConnDisconnect(updatePeers+updatePeersIdx); + } + if (cmp >= 0) + updatePeersIdx++; + } + *nUpdatePeers = newPeersIdx - nRasDeadPeers; + nRasDeadPeers = newPeersIdx; + + if (newPeers != rasDeadPeers) { + free(rasDeadPeers); + rasDeadPeers = newPeers; + rasDeadPeersSize = nNewPeers; + } + + rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + + return ncclSuccess; +} + +// Returns the index of the first available entry in the rasDeadPeers array, enlarging the array if necessary. +static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr) { + if (nRasDeadPeers == rasDeadPeersSize) { + NCCLCHECK(ncclRealloc(&rasDeadPeers, rasDeadPeersSize, rasDeadPeersSize+RAS_INCREMENT)); + rasDeadPeersSize += RAS_INCREMENT; + } + + *pAddr = rasDeadPeers+(nRasDeadPeers++); + return ncclSuccess; +} + +// Checks whether a peer is dead by looking it up in the rasDeadPeers array. +bool rasPeerIsDead(const union ncclSocketAddress* addr) { + return (rasDeadPeers != nullptr && + bsearch(addr, rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), ncclSocketsCompare) != nullptr); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Auxiliary functions -- primarily sorting/searching callbacks, plus some debug output support. // +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Searching callback for struct rasRankInit. Compares the ncclSocketAddress key against a rasRankInit element. +static int rasAddrRankInitCompare(const void* k, const void* e) { + const union ncclSocketAddress* key = (const union ncclSocketAddress*)k; + const struct rasRankInit* elem = (const struct rasRankInit*)e; + + return ncclSocketsCompare(key, &elem->addr); +} + +// Searching callback for struct rasPeerInfo. Compares the ncclSocketAddress key against a rasPeerInfo element. +static int rasAddrPeerInfoCompare(const void* k, const void* e) { + const union ncclSocketAddress* key = (const union ncclSocketAddress*)k; + const struct rasPeerInfo* elem = (const struct rasPeerInfo*)e; + + return ncclSocketsCompare(key, &elem->addr); +} + +// Sorting callback for struct rasRankInit. addr is the primary key; cudaDev is secondary. +static int rasRanksCompare(const void* e1, const void* e2) { + const struct rasRankInit* r1 = (const struct rasRankInit*)e1; + const struct rasRankInit* r2 = (const struct rasRankInit*)e2; + int cmp = ncclSocketsCompare(&r1->addr, &r2->addr); + if (cmp == 0) { + if (r1->addr.sa.sa_family == 0) // Bail out in case of empty addresses... + return 0; + assert(r1->pid == r2->pid); + cmp = (r1->cudaDev < r2->cudaDev ? -1 : (r1->cudaDev > r2->cudaDev ? 1 : 0)); + assert(cmp != 0); // There should be no complete duplicates within the rank array. + } + return cmp; +} + +// Sorting callback for ncclSocketAddress. We want to sort by the address family (IPv4 first), then the address, +// then port. Unfortunately, that's not the order of how they are laid out in memory, so one big memcmp won't do. +// memcmp is still useful though for individual elements in the network byte order. +int ncclSocketsCompare(const void* p1, const void* p2) { + const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; + const union ncclSocketAddress* a2 = (const union ncclSocketAddress*)p2; + // AF_INET (2) is less than AF_INET6 (10). 
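+  // The effective sort key is thus (address family, address bytes, port), with empty (family 0) addresses
+  // sorting last. Since addresses and ports are compared as network-byte-order bytes, memcmp gives the
+  // natural numeric ordering; e.g. (purely illustrative):
+  //   10.0.0.1:1000 < 10.0.0.1:2000 < 10.0.0.2:80 < [::1]:1000 < (empty address)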
+ int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) { + if (family > 0 && a2->sa.sa_family > 0) + return (family < a2->sa.sa_family ? -1 : 1); + else // Put empty addresses at the end (not that it matters...). + return (family > 0 ? -1 : 1); + } + + int cmp; + if (family == AF_INET) { + if ((cmp = memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr))) == 0) { + cmp = memcmp(&a1->sin.sin_port, &a2->sin.sin_port, sizeof(a1->sin.sin_port)); + } + } + else if (family == AF_INET6) { + if ((cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr))) == 0) { + cmp = memcmp(&a1->sin6.sin6_port, &a2->sin6.sin6_port, sizeof(a1->sin6.sin6_port)); + } + } else { + // The only remaining valid case are empty addresses. + assert(family == 0); + cmp = 0; // Two empty addresses are equal... + } + + return cmp; +} + +// Returns true if two socket addresses are from the same node (actually, the same network interface on one node). +bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2) { + // AF_INET (2) is less than AF_INET6 (10). + int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) + return false; + + if (family == AF_INET) + return (memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)) == 0); + else if (family == AF_INET6) + return (memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)) == 0); + else + return true; // Two empty addresses are equal... +} + +// Debug output routine: dumps the rasPeers array. +static void rasPeersDump() { + for (int p = 0; p < nRasPeers; p++) { + const struct rasPeerInfo* peer = rasPeers+p; + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + } + if (nRasPeers > 0) + INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); +} + +// Debug output routine: dumps the rasDeadPeers array. +static void rasDeadPeersDump() { + for (int p = 0; p < nRasDeadPeers; p++) { + int deadPeerIdx = rasPeerFind(rasDeadPeers+p); + INFO(NCCL_RAS, "RAS dead peer %d: %s", p, + (deadPeerIdx >= 0 ? rasPeerDump(rasPeers+deadPeerIdx, rasLine, sizeof(rasLine)) : + ncclSocketToString(rasDeadPeers+p, rasLine))); + } + if (nRasDeadPeers > 0) + INFO(NCCL_RAS, "RAS deadPeersHash 0x%lx", rasDeadPeersHash); +} + +// Debug output routine: dumps part of an individual element from the rasPeers array. +static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres) { + char line[SOCKET_NAME_MAXLEN+1], line2[1024]; + snprintf(result, nres, "socket %s, pid %d, GPU%s %s", ncclSocketToString(&peer->addr, line), peer->pid, + (__builtin_popcountll(peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); + return result; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc new file mode 100644 index 000000000..4905d7a69 --- /dev/null +++ b/src/ras/ras.cc @@ -0,0 +1,668 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out during development only! +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "utils.h" +#include "ras_internal.h" + +// Type of a notification from a local NCCL thread. 
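+// A notification travels over rasNotificationPipe as one fixed-size struct per write (see rasLocalNotify() and
+// rasLocalHandle()); the static_assert below against PIPE_BUF keeps each write small enough to be atomic, so
+// writes from different NCCL threads cannot interleave.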
+typedef enum { + RAS_ADD_RANKS = 0, + RAS_TERMINATE = 1 +} rasNotificationType; + +// Used for communication from local NCCL threads to the RAS thread. +struct rasNotification { + rasNotificationType type; + union { + struct { + struct rasRankInit* ranks; + int nranks; + } addRanks; + }; +}; +static_assert(sizeof(struct rasNotification) <= PIPE_BUF, "The rasNotification structure is too large"); + +// These ensure that we get only one RAS port/thread per process. +static std::mutex rasInitMutex; +static bool rasInitialized = false; +static int rasInitRefCount = 0; + +// The RAS network listening socket of this RAS thread (random port). +struct ncclSocket rasNetListeningSocket; + +static pthread_t rasThread; + +// Used for communication from regular NCCL threads to the RAS thread. +static std::mutex rasNotificationMutex; +static int rasNotificationPipe[2] = {-1, -1}; + +// Data for the main poll() in the RAS thread. +struct pollfd* rasPfds; +static int nRasPfds; + +// We use it all over the place; no point in wasting the stack... +char rasLine[SOCKET_NAME_MAXLEN+1]; + +// An array holding the addresses of all NCCL communicators. Modified by the NCCL threads (hence the mutex), read by +// the RAS thread. +std::mutex ncclCommsMutex; +struct ncclComm** ncclComms = nullptr; +int nNcclComms = 0; +bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank. + +static ncclResult_t rasLocalNotify(const struct rasNotification* msg); +static ncclResult_t rasLocalHandle(); +static void rasLocalHandleTerminate(); + +static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); +static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); +static ncclResult_t rasNetSendNack(struct rasSocket* sock); + +static void* rasThreadMain(void*); + +NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); + +////////////////////////////////////////////////// +// Functions invoked from regular NCCL threads. // +////////////////////////////////////////////////// + +// Invoked by regular NCCL threads on every comm initialization. This is the first function to call. +// The myRank structure should be passed with the addr element initialized to the IP address of the bootstrap +// network interface to use. On a successful return, the address will be updated with the port number of the +// RAS network listening socket. +ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) { + ncclResult_t ret = ncclSuccess; + if (!rasInitialized) { + std::lock_guard lock(rasInitMutex); + if (!rasInitialized) { + union ncclSocketAddress addr; + + memcpy(&addr, &myRank->addr, sizeof(addr)); + (addr.sa.sa_family == AF_INET ? 
addr.sin.sin_port : addr.sin6.sin6_port) = htons(0); + NCCLCHECKGOTO(ncclSocketInit(&rasNetListeningSocket, &addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, + /*abortFlag*/nullptr, /*asyncFlag*/1), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(&rasNetListeningSocket), ret, fail); + INFO(NCCL_RAS, "RAS network listening socket at %s", + ncclSocketToString(&rasNetListeningSocket.addr, rasLine)); + + (void)rasClientInitSocket(); + + SYSCHECKGOTO(pipe(rasNotificationPipe), "pipe", ret, fail); + + PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); + ncclSetThreadName(rasThread, "NCCL RAS"); + (void)pthread_detach(rasThread); + + rasInitialized = true; + } + } + ncclAtomicRefCountIncrement(&rasInitRefCount); + + { + std::lock_guard lock(ncclCommsMutex); + + int i; + for (i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == nullptr) + break; + } + if (i == nNcclComms) { + NCCLCHECK(ncclRealloc(&ncclComms, nNcclComms, nNcclComms+RAS_INCREMENT*8)); + nNcclComms += RAS_INCREMENT*8; + } + ncclComms[i] = comm; + ncclCommsSorted = false; + } + + if (myRank != nullptr) + memcpy(&myRank->addr, &rasNetListeningSocket.addr, sizeof(myRank->addr)); + +exit: + return ret; +fail: + if (rasNotificationPipe[1] != 0) + (void)close(rasNotificationPipe[1]); + if (rasNotificationPipe[0] != 0) + (void)close(rasNotificationPipe[0]); + (void)close(rasClientListeningSocket); + (void)ncclSocketClose(&rasNetListeningSocket); + goto exit; +} + +// Invoked by regular NCCL threads on every comm termination. +ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { + if (!rasInitialized) + return ncclSuccess; + { + std::lock_guard lock(ncclCommsMutex); + for (int i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == comm) { + ncclComms[i] = nullptr; + ncclCommsSorted = false; + break; + } + } + } + if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { + struct rasNotification msg; + msg.type = RAS_TERMINATE; + NCCLCHECK(rasLocalNotify(&msg)); + } + return ncclSuccess; +} + +// Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within +// the communicator. +ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { + struct rasNotification msg; + msg.type = RAS_ADD_RANKS; + msg.addRanks.ranks = ranks; + msg.addRanks.nranks = nranks; + NCCLCHECK(rasLocalNotify(&msg)); + return ncclSuccess; +} + +// Internal function running on regular NCCL threads -- asynchronously notifies the RAS thread. +static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { + if (!rasInitialized) + return ncclSuccess; + + // Take an exclusive lock here to avoid multiplexing between multiple user threads (not sure if it's + // strictly required, but it won't hurt)... + std::lock_guard lock(rasNotificationMutex); + size_t done = 0; + while (done < sizeof(*msg)) { + ssize_t written; + SYSCHECK(written = write(rasNotificationPipe[1], (char*)msg + done, sizeof(*msg) - done), "write"); + done += written; + } + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////////////////////// +// Functions related to the handling of local notifications from NCCL threads. // +///////////////////////////////////////////////////////////////////////////////// + +// Handles asynchronous local notifications arriving from regular NCCL threads. 
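+// Invoked from the RAS thread's poll loop (rasThreadMain()) when rasNotificationPipe[0] becomes readable.
+// The read loop below mirrors the write loop in rasLocalNotify(): it keeps reading until a complete
+// rasNotification structure has been assembled, and treats EOF as an error.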
+static ncclResult_t rasLocalHandle() { + struct rasNotification msg; + + size_t done = 0; + while (done < sizeof(msg)) { + ssize_t nread; + SYSCHECK(nread = read(rasNotificationPipe[0], (char*)&msg + done, sizeof(msg) - done), "read"); + if (nread == 0) // EOF + return ncclSystemError; + done += nread; + } + + if (msg.type == RAS_ADD_RANKS) { + NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + } else if (msg.type == RAS_TERMINATE) { + rasLocalHandleTerminate(); + } else { + WARN("RAS received unknown notification type %d", msg.type); + return ncclInternalError; + } + + return ncclSuccess; +} + +// Handles local RAS_TERMINATE notification. +static void rasLocalHandleTerminate() { + INFO(NCCL_RAS, "RAS handling local termination request"); + // For now we don't do anything. +} + + +//////////////////////////////////////////////// +// Generic functions related to RAS messages. // +//////////////////////////////////////////////// + +// Allocates a RAS message of the desired length for sending. +// Behind the scenes allocates encapsulating rasMsgMeta structure, which includes local metadata stored in front +// of the message. +// Must use rasMsgFree to free. +ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen) { + struct rasMsgMeta* meta = nullptr; + NCCLCHECK(ncclCalloc((char**)&meta, offsetof(struct rasMsgMeta, msg) + msgLen)); + *msg = &meta->msg; + // coverity[leaked_storage:FALSE] => rasMsgFree is used to free it + return ncclSuccess; +} + +// To be used only with messages allocated with rasMsgAlloc. I.e., it should be used for sent messages, not +// for received ones. +void rasMsgFree(struct rasMsg* msg) { + if (msg) { + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + free(meta); + } +} + +// Enqueues a message for sending down a RAS connection. +void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front) { + // Get to the metadata of this message. + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + bool ready = false; + + meta->enqueueTime = clockNano(); + meta->offset = 0; + meta->length = msgLen; + + if (front) + ncclIntruQueueEnqueueFront(&conn->sendQ, meta); + else + ncclIntruQueueEnqueue(&conn->sendQ, meta); + + if (conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[sock->pfd].events |= POLLOUT; + ready = true; + } + } + if (!ready) { + // It's not a bug, unless it's for things like keep-alive messages... + INFO(NCCL_RAS, "RAS enqueued message type %d on a non-ready connection with %s " + "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", + msg->type, ncclSocketToString(&conn->addr, rasLine), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + } +} + +// Attempts to send the queued RAS messages to another RAS thread. +ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { + struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; + struct rasMsgMeta* meta; + *closed = 0; + while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { + if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + // We don't send anything beyond the handshake at this point. 
+ meta = nullptr; + break; + } + if (meta->offset < sizeof(meta->length)) { + // Send the length of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + if (*closed) + return ncclSuccess; + if (meta->offset < sizeof(meta->length)) + break; + } + // Send the body of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + meta->length+sizeof(meta->length), &meta->offset, closed)); + if (*closed) + return ncclSuccess; + if (meta->offset < meta->length+sizeof(meta->length)) + break; + ncclIntruQueueDequeue(&conn->sendQ); + free(meta); + } + + *allSent = !meta; + + return ncclSuccess; +} + +// Attempts to receive a message through a RAS socket. +ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed) { + *closed = 0; + if (sock->recvOffset < sizeof(sock->recvLength)) { + // Receive the length of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, &sock->recvLength, sizeof(sock->recvLength), + &sock->recvOffset, closed)); + if (*closed || sock->recvOffset < sizeof(sock->recvLength)) + return ncclSuccess; + NCCLCHECK(ncclCalloc((char**)&sock->recvMsg, sock->recvLength)); + } + // Receive the body of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, ((char*)sock->recvMsg)-sizeof(sock->recvLength), + sock->recvLength+sizeof(sock->recvLength), &sock->recvOffset, closed)); + if (*closed || sock->recvOffset < sock->recvLength+sizeof(sock->recvLength)) + return ncclSuccess; + + *msg = sock->recvMsg; + sock->recvMsg = nullptr; + sock->recvOffset = sock->recvLength = 0; + + return ncclSuccess; +} + + +////////////////////////////////////////////////////////////////// +// Functions related to the handling of specific message types. // +////////////////////////////////////////////////////////////////// + +// Invoked from the main RAS thread to dispatch incoming messages to the appropriate handler. +ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { + if (msg->type == RAS_MSG_CONNINIT) { + NCCLCHECK(rasMsgHandleConnInit(msg, sock)); + } else if (msg->type == RAS_MSG_CONNINITACK) { + NCCLCHECK(rasMsgHandleConnInitAck(msg, sock)); + } else if (msg->type == RAS_MSG_KEEPALIVE) { + NCCLCHECK(rasMsgHandleKeepAlive(msg, sock)); + } else if (msg->type == RAS_MSG_PEERSUPDATE) { + NCCLCHECK(rasMsgHandlePeersUpdate(msg, sock)); + } else if (msg->type == RAS_MSG_COLLREQ) { + NCCLCHECK(rasMsgHandleCollReq(msg, sock)); + } else if (msg->type == RAS_MSG_COLLRESP) { + NCCLCHECK(rasMsgHandleCollResp(msg, sock)); + } else { + WARN("RAS received unknown message type (%d) from %s", msg->type, ncclSocketToString(&sock->sock.addr, rasLine)); + return ncclInternalError; + } + + return ncclSuccess; +} + +// Handles the first message sent over a RAS socket as part of the handshake. 
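+// The handshake, as implemented by this handler and rasMsgHandleConnInitAck() below, is a simple two-message
+// exchange (a sketch of the sequence only, not literal code):
+//
+//   connecting side                          accepting side
+//   RAS_MSG_CONNINIT { ncclVersion,    -->   version/dead-peer checks; on failure reply with nack=1
+//     listeningAddr, peersHash,              and terminate the socket
+//     deadPeersHash }
+//                                      <--   RAS_MSG_CONNINITACK { nack=0 }
+//                                      <--   optional RAS_MSG_PEERSUPDATE if the hashes differ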
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { + ncclResult_t ret = ncclSuccess; + struct rasConnection* conn = nullptr; + int connIdx, peerIdx; + struct rasMsg* newMsg = nullptr; + int newMsgLen; + char line[SOCKET_NAME_MAXLEN+1]; + + INFO(NCCL_RAS, "RAS handling connInit from %s (version %d, listeningAddr %s, peersHash 0x%lx, deadPeersHash 0x%lx)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->connInit.ncclVersion, + ncclSocketToString(&msg->connInit.listeningAddr, line), msg->connInit.peersHash, msg->connInit.deadPeersHash); + + if (msg->connInit.ncclVersion != NCCL_VERSION_CODE) { + // Close any such sockets immediately! This is basically unrecoverable... + WARN("NCCL version mismatch with remote peer %s (local: %d, remote %d)", + ncclSocketToString(&sock->sock.addr, rasLine), NCCL_VERSION_CODE, msg->connInit.ncclVersion); + rasNetSendNack(sock); + rasSocketTerminate(sock, /*finalize*/true); + ret = ncclInvalidUsage; + goto exit; + } + + if (rasPeerIsDead(&msg->connInit.listeningAddr)) { + // A peer long declared dead is suddenly alive again?! + INFO(NCCL_RAS, "RAS connection from peer %s that is considered dead!", + ncclSocketToString(&msg->connInit.listeningAddr, rasLine)); + rasNetSendNack(sock); + rasSocketTerminate(sock, /*finalize*/true); + goto exit; + } + + // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). + connIdx = rasConnFind(&msg->connInit.listeningAddr); + if (connIdx != -1) { + conn = rasConns+connIdx; + + INFO(NCCL_RAS, + "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", + (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); + + if (conn->sockIdx != -1) { + struct rasSocket* connSock = rasSockets+conn->sockIdx; + INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", + connSock->status, (clockNano()-connSock->createTime)/1e9); + // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have + // a race where both sides attempt to establish a connection at roughly the same time, so the other side's + // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. + // If each side closed the "old" one, both would end up being closed. + // As we normally try to initiate connections from the side with a lower address (precisely to avoid such + // situations), we'll follow the same logic here: the "lower" side will reject the new connection (as it + // came from the "wrong" side), whereas the "higher" side will keep the new one (as it came from the correct + // side) and terminate the old one (that it presumably just opened). + if (ncclSocketsCompare(&rasNetListeningSocket.addr, &conn->addr) < 0) { + INFO(NCCL_RAS, "RAS terminating the new socket"); + rasSocketTerminate(sock, /*finalize*/true); + goto exit; + } else { + INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); + rasSocketTerminate(connSock); + } + } + } + if (!conn) { + NCCLCHECK(getNewConnEntry(&conn)); + memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); + connIdx = conn - rasConns; + } + + sock->status = RAS_SOCK_READY; + // rasConnResume will reset any experiencingDelays, startRetryTime, etc. 
+ + conn->sockIdx = sock-rasSockets; + sock->connIdx = connIdx; + memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); + + // Make sure that the connection is part of the right links forming the RAS network. At this point we only + // update the expected (non-external) connections; external ones will be added during keep-alive handling. + peerIdx = rasPeerFind(&conn->addr); + // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before + // the peers update. + if (peerIdx != -1) { + (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); + (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + } + + // Send a confirmation to the server that requested the connection (so that the resilience code can mark + // the connection as live). + newMsgLen = rasMsgLength(RAS_MSG_CONNINITACK); + NCCLCHECK(rasMsgAlloc(&newMsg, newMsgLen)); + newMsg->type = RAS_MSG_CONNINITACK; + newMsg->connInitAck.nack = 0; + rasConnEnqueueMsg(conn, newMsg, newMsgLen, /*front*/true); + + conn->lastRecvPeersHash = msg->connInit.peersHash; + conn->lastRecvDeadPeersHash = msg->connInit.deadPeersHash; + + if (msg->connInit.peersHash != rasPeersHash || msg->connInit.deadPeersHash != rasDeadPeersHash) { + // Send my rasPeers and request the same in return. + INFO(NCCL_RAS, "RAS connInit hash mismatch (my peersHash 0x%lx, deadPeersHash 0x%lx); sending my (dead) peers", + rasPeersHash, rasDeadPeersHash); + NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + } +exit: + return ret; +} + +// Handles the second message sent over a RAS socket as part of the handshake. +static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock) { + INFO(NCCL_RAS, "RAS handling connInitAck from %s (nack %d)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->connInitAck.nack); + + if (msg->connInitAck.nack) { + // The remote peer doesn't want to talk to us. The easiest way to prevent it is by declaring it dead. + // We make a copy of the address because rasConnDisconnect will terminate the rasSocket. + union ncclSocketAddress addr; + memcpy(&addr, &sock->sock.addr, sizeof(addr)); + rasConnDisconnect(&addr); + (void)rasPeerDeclareDead(&addr); + + return ncclSuccess; + } + + sock->status = RAS_SOCK_READY; + // rasConnResume will reset any experiencingDelays, startRetryTime, etc. + + return ncclSuccess; +} + +// Handles the deadPeer broadcast. +void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); + + if (!rasPeerIsDead(&req->deadPeer.addr)) { + rasConnDisconnect(&req->deadPeer.addr); + (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pDone = false; + } else { + INFO(NCCL_RAS, "RAS already knew it was dead"); + // No point in re-broadcasting what's already known. + *pDone = true; + } +} + +// Attempts to immediately send a fatal NACK connInitAck response to a socket. A bit of a hack (as it doesn't +// follow our usual message queuing and polling convention) but, since this can be invoked only for newly opened +// connections, and the message is tiny, it should be OK. We can't use the regular path because the socket is +// about to be terminated. 
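+// Wire format reminder: every RAS message travels as a 32-bit length (an int) followed by the rasMsg payload
+// of that many bytes (see rasConnSendMsg() and rasMsgRecv() above), which is why this function issues two
+// separate ncclSocketProgress() calls: one for the length, one for the body.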
+static ncclResult_t rasNetSendNack(struct rasSocket* sock) { + struct rasMsg msg; + int length = rasMsgLength(RAS_MSG_CONNINITACK); + int closed = 0; + int offset; + + INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + + msg.type = RAS_MSG_CONNINITACK; + msg.connInitAck.nack = 1; + offset = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &length, sizeof(length), &offset, &closed)); + if (closed || offset < sizeof(length)) + return ncclSuccess; + offset = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &msg, length, &offset, &closed)); + // We are closing this socket anyway -- it doesn't matter to us if we succeeded or not. + + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////// +// Functions related to the main event loop of the RAS thread. // +///////////////////////////////////////////////////////////////// + +// Main function of the RAS thread. +static void* rasThreadMain(void*) { + ncclResult_t ret = ncclSuccess; // Unused. + int pfd; + int rasNetListeningSocketFd; + + INFO(NCCL_RAS, "RAS thread started"); + + // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + rasPfds[pfd].fd = rasNotificationPipe[0]; + rasPfds[pfd].events = POLLIN; + + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + rasPfds[pfd].fd = rasNetListeningSocketFd; + rasPfds[pfd].events = POLLIN; + + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + rasPfds[pfd].fd = rasClientListeningSocket; + rasPfds[pfd].events = POLLIN; + + // Main event loop of the RAS thread. + for (int64_t nextWakeup=0;;) { + int timeout, nEvents; + int64_t now = clockNano(); + if (nextWakeup > 0) { + // The "1" below helps avoid round-downs and especially zeroes. + if (nextWakeup > now) + timeout = (nextWakeup - now) / (CLOCK_UNITS_PER_SEC / 1000) + 1; + else + timeout = 1; + } else { + timeout = 1000; // 1 second. + } + + nEvents = poll(rasPfds, nRasPfds, timeout); + + nextWakeup = clockNano()+CLOCK_UNITS_PER_SEC; + if (nEvents == -1 && errno != EINTR) + INFO(NCCL_RAS, "RAS continuing in spite of an unexpected error from poll: %s", strerror(errno)); + + // Handle any poll-related events. + for (int pollIdx = 0; pollIdx < nRasPfds && nEvents > 0; pollIdx++) { + if (rasPfds[pollIdx].revents) { + nEvents--; + if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { + (void)rasLocalHandle(); + } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { + (void)rasNetAcceptNewSocket(); + } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { + (void)rasClientAcceptNewSocket(); + } else { + // Check if it's one of the RAS sockets. + int sockIdx; + for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { + struct rasSocket* sock = rasSockets+sockIdx; + if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sockIdx, pollIdx); + break; + } + } // for (sockIdx) + + if (sockIdx == nRasSockets) { + // Try a client socket instead. 
+ for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { + struct rasClient* client = rasClients+clientIdx; + if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(clientIdx, pollIdx); + break; + } + } // for (clientIdx) + } // if (sockIdx == nRasSockets) + } // dynamic fds + } // if (revents) + } // for (pollIdx) + + now = clockNano(); + + rasSocksHandleTimeouts(now, &nextWakeup); + + rasConnsHandleTimeouts(now, &nextWakeup); + + rasNetHandleTimeouts(now, &nextWakeup); + + rasCollsHandleTimeouts(now, &nextWakeup); + } // for (;;) + +fail: + WARN("fatal error - RAS thread terminating"); + std::lock_guard lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + (void)close(rasClientListeningSocket); + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitialized = false; + return nullptr; +} + +// Returns the index of the first available entry in the rasPfds array, enlarging the array if necessary. +ncclResult_t rasGetNewPollEntry(int* index) { + int i; + for (i = 0; i < nRasPfds; i++) + if (rasPfds[i].fd == -1) + break; + if (i == nRasPfds) { + NCCLCHECK(ncclRealloc(&rasPfds, nRasPfds, nRasPfds+RAS_INCREMENT)); + nRasPfds += RAS_INCREMENT; + for (int j = i; j < nRasPfds; j++) + rasPfds[j].fd = -1; + } + + memset(rasPfds+i, '\0', sizeof(*rasPfds)); + rasPfds[i].fd = -1; + + *index = i; + return ncclSuccess; +} diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h new file mode 100644 index 000000000..68cac0b44 --- /dev/null +++ b/src/ras/ras_internal.h @@ -0,0 +1,512 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_RAS_INTERNAL_H_ +#define NCCL_RAS_INTERNAL_H_ + +#define NCCL_RAS_CLIENT_PORT 28028 +#define NCCL_RAS_CLIENT_PROTOCOL 2 + +#define RAS_COLLECTIVE_LEG_TIMEOUT_SEC 5 +#define RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC RAS_COLLECTIVE_LEG_TIMEOUT_SEC + +// End of the client section; everything below is meant for the NCCL threads only. +#ifndef NCCL_RAS_CLIENT + +#include + +#include "nccl.h" +#include "ras.h" +#include "socket.h" +#include "utils.h" + +// Type of a RAS network or client message. +typedef enum { + RAS_MSG_CONNINIT = 1, + RAS_MSG_CONNINITACK = 2, + RAS_MSG_KEEPALIVE = 3, + RAS_MSG_PEERSUPDATE = 4, + RAS_MSG_COLLREQ = 5, + RAS_MSG_COLLRESP = 6, +} rasMsgType; + +// Type of a RAS network collective message. +typedef enum { + RAS_MSG_NONE = 0, + RAS_BC_DEADPEER = 1, + // Broadcast operations above this line; collective operations below (1000 is the demarcation line). + RAS_COLL_CONNS = 1001, // Collect data about all RAS connections. + RAS_COLL_COMMS = 1002, // Collect data about all communicators. +} rasCollectiveType; + +// Payload of a collective request message (RAS_MSG_COLLREQ). +struct rasCollRequest { + union ncclSocketAddress rootAddr; + uint64_t rootId; + + int64_t timeout; + rasCollectiveType type; + union { + struct { + union ncclSocketAddress addr; + } deadPeer; + struct { + } conns; + struct { + } comms; + }; +}; + +// Payload of a collective response message (RAS_MSG_COLLRESP). +struct rasCollResponse { + union ncclSocketAddress rootAddr; + uint64_t rootId; + + int nLegTimeouts; // If >0, indicates incomplete data. + int nPeers; + int nData; // Size of data in bytes. + union ncclSocketAddress peers[0]; // Variable length. 
+ // The peersAddrs array is followed by: + //alignas(int64_t) char data[0]; // Variable length, collective-dependent. +}; + +// Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each +// NCCL process. +struct rasPeerInfo { + union ncclSocketAddress addr; + pid_t pid; + uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. +}; + +// Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host +// byte order. Depending on the message type, the length of the message will vary. +struct rasMsg { + rasMsgType type; + union { + struct { + int ncclVersion; + union ncclSocketAddress listeningAddr; + uint64_t peersHash; + uint64_t deadPeersHash; + } connInit; // Sent by the connecting side as the first message. + struct { + int nack; // If non-0, we should stop trying to reconnect. + } connInitAck; // Response from the accepting side to the above. + struct { + uint64_t peersHash; + uint64_t deadPeersHash; + int linkMask; // What links at the destination peer should the connection be part of + // (bit 0: nextLink; bit 1: prevLink). + struct timespec realTime; // Wallclock time at the source, for statistical purposes (in principle there's + // no guarantee that the nodes have synchronized clocks so we can't really rely + // on it for anything important).. + int nack; // If non-0, it means that this message is a response to an unexpected keepAlive message. + } keepAlive; + struct { + uint64_t peersHash; + uint64_t deadPeersHash; + int nPeers; + int nDeadPeers; + struct rasPeerInfo peers[0]; // Variable length. + // The peers array is followed by the following: + //union ncclSocketAddress deadPeers[0]; // Variable length. + } peersUpdate; + struct { + int protocol; // Protocol version, sent to the client. + } clientInit; + struct { + int nData; + char data[0]; // Variable length. + } clientDump; + struct rasCollRequest collReq; // Variable length. + struct rasCollResponse collResp; // Variable length. + }; +}; + +// Returns the size of the collective portion of a collective request message. +static inline size_t rasCollDataLength(rasCollectiveType type) { + struct rasCollRequest* data; + switch (type) { + case RAS_BC_DEADPEER: + return offsetof(struct rasCollRequest, deadPeer) + sizeof(data->deadPeer); + case RAS_COLL_CONNS: + return offsetof(struct rasCollRequest, conns) + sizeof(data->conns); + case RAS_COLL_COMMS: + return offsetof(struct rasCollRequest, comms) + sizeof(data->comms); + case RAS_MSG_NONE: + return 0; + }; + return 0; +} + +// Returns the size for a message of a particular type. +static inline size_t rasMsgLength(rasMsgType type, rasCollectiveType collType = RAS_MSG_NONE) { + struct rasMsg* msg; + switch (type) { + case RAS_MSG_CONNINIT: + return offsetof(struct rasMsg, connInit) + sizeof(msg->connInit); + case RAS_MSG_CONNINITACK: + return offsetof(struct rasMsg, connInitAck) + sizeof(msg->connInitAck); + case RAS_MSG_KEEPALIVE: + return offsetof(struct rasMsg, keepAlive) + sizeof(msg->keepAlive); + case RAS_MSG_PEERSUPDATE: + return offsetof(struct rasMsg, peersUpdate) + sizeof(msg->peersUpdate); + case RAS_MSG_COLLREQ: + return offsetof(struct rasMsg, collReq) + rasCollDataLength(collType); + case RAS_MSG_COLLRESP: + return offsetof(struct rasMsg, collResp) + sizeof(msg->collResp); + }; + return 0; +} + +// How much to enlarge any RAS array by if we run out of space. 
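+// (All growable RAS arrays -- e.g. rasConns, rasPfds, rasDeadPeers and the per-link conns arrays -- are
+// extended by this amount when helpers such as getNewConnEntry() or rasGetNewPollEntry() run out of free
+// entries.)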
+#define RAS_INCREMENT 4
+
+// Our clock has nanosecond resolution.
+#define CLOCK_UNITS_PER_SEC 1000000000L
+
+// Keep-alive messages are sent no sooner than a second after the last message was sent down a particular connection.
+#define RAS_KEEPALIVE_INTERVAL (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If no message arrives in 5 seconds via a particular connection that uses keep-alive messages, generate a warning
+// and try alternative connections.
+#define RAS_KEEPALIVE_TIMEOUT_WARN (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a socket that uses keep-alive messages if no message arrives in 20 seconds.
+// We will try to re-establish communication via that connection (until RAS_PEER_DEAD_TIMEOUT).
+#define RAS_KEEPALIVE_TIMEOUT_ERROR RAS_STUCK_TIMEOUT
+
+// Retry connecting on failing sockets (ECONNREFUSED, etc.) once a second.
+#define RAS_CONNECT_RETRY (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If we can't connect in 5 seconds, we generate a warning and try alternative connections.
+#define RAS_CONNECT_WARN RAS_KEEPALIVE_TIMEOUT_WARN
+
+// Abort a busy socket (one we are trying to send on, or one that was being established) if there's been
+// no sign of progress in 20 seconds. We will try to re-establish communication (up to RAS_PEER_DEAD_TIMEOUT).
+#define RAS_STUCK_TIMEOUT (20*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Terminate ad-hoc connections that have not been used in 60 seconds.
+#define RAS_IDLE_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If the socket is closed by the peer within 5 seconds from the idle timeout, do not attempt to re-establish.
+#define RAS_IDLE_GRACE_PERIOD (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Declare a peer as dead and don't retry communicating with it if we couldn't reach it for 60 seconds.
+#define RAS_PEER_DEAD_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a leg of a collective operation if the response takes more than 5 seconds to arrive *and* one of the
+// connections experiences delays.
+#define RAS_COLLECTIVE_LEG_TIMEOUT (RAS_COLLECTIVE_LEG_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a whole collective operation after at most RAS_COLLECTIVE_LEG_TIMEOUT+RAS_COLLECTIVE_EXTRA_TIMEOUT (10s).
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT (RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Structure used for tracking the progress of sending a RAS message.
+struct rasMsgMeta {
+  struct rasMsgMeta* next;
+  int64_t enqueueTime;
+  int offset; // Progress sending the message (including the message size itself, an int, which is sent first).
+  int length; // Length of the message (*excluding* the message size).
+  struct rasMsg msg; // Variable length.
+};
+
+// Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response).
+// For every collective operation, each participating RAS thread will create its own.
+struct rasCollective {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+
+  rasCollectiveType type;
+
+  int64_t timeout;
+  bool timeoutWarned;
+
+  int64_t startTime; // For timeout calculations.
+  int fromConnIdx; // The connection we received the request from.
+
+  int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive.
+  int nFwdSent; // Count of the above (local process only).
+  int nFwdRecv; // Count of the responses received or timeouts (local process only).
+
+  int nLegTimeouts; // Collective (from this process and the responses we received).
+
+  union ncclSocketAddress* peers; // Collective (from this process and the responses we received).
+  int nPeers;
+
+  char* data; // Collective (from this process and the responses we received).
+  int nData;
+};
+
+// Collective data in RAS_COLL_CONNS responses.
+struct rasCollConns {
+  int64_t travelTimeMin;
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+  int nConns;
+  int nNegativeMins;
+  struct negativeMin {
+    union ncclSocketAddress source;
+    union ncclSocketAddress dest;
+    int64_t travelTimeMin;
+  } negativeMins[0]; // Variable length.
+};
+
+// Collective data in RAS_COLL_COMMS responses.
+struct rasCollComms {
+  int nComms;
+  struct comm {
+    uint64_t commHash;
+    int commNRanks;
+    int nRanks; // Number of elements in the array below, *not* in the communicator.
+    struct rank {
+      int commRank;
+      int peerIdx; // Index within rasCollective->peers, *not* rasPeers.
+      uint64_t collOpCount;
+      struct {
+        ncclResult_t initState:4;
+        ncclResult_t asyncError:4;
+        bool finalizeCalled:1;
+        bool destroyFlag:1;
+        bool abortFlag:1;
+      } status;
+      char cudaDev;
+      char nvmlDev;
+    } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process.
+  } comms[0]; // Variable length. Sorted by commHash.
+};
+
+// Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one
+// or one of the fallbacks).
+struct rasLinkConn {
+  int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated
+               // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates).
+  int connIdx; // Index in the rasConns array of the connection to the above peer. Could be -1 (a placeholder
+               // for a connection to be started by the remote peer).
+  bool external; // true if the entry exists only due to an external request (requested by a remote peer, most
+                 // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a
+                 // valid primary connection, in order to ensure that keep-alive messages are sent.
+};
+
+// Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in
+// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but
+// they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS
+// network is reconfigured or a peer dies.
+struct rasLink {
+  int direction; // 1 for nextLink, -1 for prevLink.
+
+  // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having
+  // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have
+  // the lowest indices).
+  struct rasLinkConn* conns;
+  int nConns;
+  int connsSize; // Array size; could be larger than nConns.
+
+  // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect
+  // the peer on the other side to do so) but that peer failed to initiate.
+  int64_t lastUpdatePeersTime;
+};
+
+// Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile
+// socket (described by the rasSocket structure), which can be affected by transient network issues.
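+// Taken together, the three structures form a hierarchy: a rasLink describes a persistent direction on the
+// RAS ring, a rasConnection describes the relationship with one specific peer, and a rasSocket (below) is the
+// current, volatile file descriptor carrying that connection. A link may reference several connections
+// (a primary plus fallbacks), and a connection may outlive many sockets as they get torn down and re-created.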
+struct rasConnection { + bool inUse; + + union ncclSocketAddress addr; + + // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // to a single entry here, for sockets that are in the process of being terminated and re-established. + // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. + // -1 if there is no such socket. + int sockIdx; + + // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. + // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. + // lastSentPeersHash stores *our* rasPeersHash from the time we last sent a peers *update* through this connection + // (which is different than sending just the hash, like we do in KEEPALIVE, etc.). + // lastRecvPeersHash stores the latest known rasPeersHash of the peer (received via KEEPALIVE, etc.). + uint64_t lastSentPeersHash; + uint64_t lastRecvPeersHash; + + // Same but for rasDeadPeersHash. + uint64_t lastSentDeadPeersHash; + uint64_t lastRecvDeadPeersHash; + + // Queue of messages to send. + struct ncclIntruQueue sendQ; + + // Used for keeping track of timeouts that may extend beyond the lifetime of a socket. + // The timeout starts when the connection is being created (and is turned off when the initialization is completed + // successfully) or when we detect a problem, such as a socket timeout (in the latter case, we may need to + // retroactively calculate the start time). + // A value of 0 indicates that they are not currently in use. + int64_t startRetryTime; + int64_t lastRetryTime; + + bool experiencingDelays; // A flag indicating that the connection is currently subject to RAS_KEEPALIVE_TIMEOUT_WARN + // or RAS_CONNECT_WARN timeout. If set, the warnings have been issued and the fallbacks + // have been initiated if needed. + bool linkFlag; // Used within rasNet* calls to mark whether this connection was already handled when iterating over + // multiple links (since a connection can belong to more than one link). + // The below four fields are for statistical purposes only. + int64_t travelTimeMin; + int64_t travelTimeMax; + int64_t travelTimeSum; + int64_t travelTimeCount; +}; + +// Status of a RAS socket. +typedef enum { + RAS_SOCK_CLOSED = 0, + RAS_SOCK_CONNECTING = 1, + RAS_SOCK_HANDSHAKE = 2, + RAS_SOCK_READY = 3, + RAS_SOCK_TERMINATING = 4 +} rasSocketStatus; + +// Describes a socket implementing communication between two peers. +struct rasSocket { + struct ncclSocket sock; + + rasSocketStatus status; + + int pfd; // Index in the rasPfds array. + + // Index of the corresponding entry in the rasConns array. + // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. + // -1 if there is no connection (normal condition on the accept side before the connInit message). + int connIdx; + + int64_t createTime; + int64_t lastSendTime; + int64_t lastRecvTime; + + // Data on the message currently being received. + int recvOffset; + int recvLength; + struct rasMsg* recvMsg; +}; + +// Status of a RAS client. +typedef enum { + RAS_CLIENT_CLOSED = 0, + RAS_CLIENT_CONNECTED = 1, + RAS_CLIENT_INIT = 2, + RAS_CLIENT_CONNS = 3, + RAS_CLIENT_COMMS = 4, + RAS_CLIENT_FINISHED = 99 +} rasClientStatus; + +// Describes a RAS client. +struct rasClient { + int sock; + + rasClientStatus status; + + int pfd; // Index in the rasPfds array. 
+
+  char recvBuffer[1024];
+  int recvOffset;
+
+  // Queue of messages to send.
+  struct ncclIntruQueue sendQ;
+
+  int verbose;
+  int64_t timeout;
+
+  // State stored during asynchronous operations such as collectives.
+  int collIdx; // Index to the ongoing rasCollective.
+};
+
+
+// ras.cc
+extern struct pollfd* rasPfds;
+extern struct ncclSocket rasNetListeningSocket;
+extern std::mutex ncclCommsMutex;
+extern struct ncclComm** ncclComms;
+extern int nNcclComms;
+extern bool ncclCommsSorted;
+extern char rasLine[SOCKET_NAME_MAXLEN+1];
+
+int64_t ncclParamRasTimeoutFactor();
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen);
+void rasMsgFree(struct rasMsg* msg);
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front = false);
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent);
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed);
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock);
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone);
+ncclResult_t rasGetNewPollEntry(int* index);
+
+
+// rasnet.cc
+extern struct rasLink rasNextLink, rasPrevLink;
+extern struct rasConnection* rasConns;
+extern int nRasConns;
+extern struct rasSocket *rasSockets;
+extern int nRasSockets;
+
+ncclResult_t getNewConnEntry(struct rasConnection** pConn);
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx);
+int rasConnFind(const union ncclSocketAddress* addr);
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasConnDisconnect(const union ncclSocketAddress* addr);
+ncclResult_t rasNetAcceptNewSocket();
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0,
+                        bool retry = true);
+void rasSockEventLoop(int sockIdx, int pollIdx);
+void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup);
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false,
+                               bool insert = false, bool pretend = false, int* pLinkIdx = nullptr);
+
+// peers.cc
+extern struct rasPeerInfo* rasPeers;
+extern int nRasPeers;
+extern uint64_t rasPeersHash;
+extern union ncclSocketAddress* rasDeadPeers;
+extern int nRasDeadPeers;
+extern uint64_t rasDeadPeersHash;
+
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks);
+int rasPeerFind(const union ncclSocketAddress* addr);
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback = false);
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr);
+bool rasPeerIsDead(const union ncclSocketAddress* addr);
+int ncclSocketsCompare(const void* p1, const void* p2);
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2);
+
+
+// collectives.cc
+extern struct rasCollective* rasCollectives;
+
+void rasCollReqInit(struct rasCollRequest* req);
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr,
+                               int* pCollIdx = nullptr, int fromConnIdx = -1);
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock);
+void rasCollsPurgeConn(int connIdx);
+void rasCollFree(struct rasCollective* coll);
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+
+// client_support.cc
+extern int rasClientListeningSocket;
+extern struct rasClient* rasClients;
+extern int nRasClients;
+ncclResult_t rasClientInitSocket();
+ncclResult_t rasClientAcceptNewSocket();
+ncclResult_t rasClientResume(struct rasCollective* coll);
+void rasClientEventLoop(int clientIdx, int pollIdx);
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size);
+
+#endif // !NCCL_RAS_CLIENT
+
+#endif // !NCCL_RAS_INTERNAL_H_
diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc
new file mode 100644
index 000000000..441ad192c
--- /dev/null
+++ b/src/ras/rasnet.cc
@@ -0,0 +1,1189 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+
+#include "ras_internal.h"
+
+// Links forming the backbone of the RAS network (currently a ring).
+struct rasLink rasNextLink = {1}, rasPrevLink = {-1};
+
+// Connections on the RAS network.
+struct rasConnection* rasConns;
+int nRasConns;
+
+// Sockets implementing the RAS network.
+struct rasSocket *rasSockets;
+int nRasSockets;
+
+// Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but
+// I didn't want to use -1 because it has a special meaning for us.
+#define POLL_FD_IGNORE -2
+
+static void rasConnOpen(struct rasConnection* conn);
+static ncclResult_t rasConnPrepare(struct rasConnection* conn);
+static void rasConnTerminate(struct rasConnection* conn);
+
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock);
+
+static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup);
+static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup);
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false);
+
+static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx);
+static void rasConnResume(struct rasConnection* conn);
+static void rasLinkSanitizeFallbacks(struct rasLink* link);
+static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1);
+static int rasLinkFindConn(const struct rasLink* link, int connIdx);
+
+
+///////////////////////////////////////////////
+// Functions related to the RAS connections. //
+///////////////////////////////////////////////
+
+// Allocates an entry in the rasConns array, enlarging the array if necessary.
+ncclResult_t getNewConnEntry(struct rasConnection** pConn) {
+  struct rasConnection* conn;
+  int i;
+  for (i = 0; i < nRasConns; i++)
+    if (!rasConns[i].inUse)
+      break;
+  if (i == nRasConns) {
+    NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT));
+    nRasConns += RAS_INCREMENT;
+  }
+
+  conn = rasConns+i;
+  memset(conn, '\0', sizeof(*conn));
+  conn->inUse = true;
+  conn->sockIdx = -1;
+  ncclIntruQueueConstruct(&conn->sendQ);
+  conn->travelTimeMin = INT64_MAX;
+  conn->travelTimeMax = INT64_MIN;
+
+  *pConn = conn;
+  return ncclSuccess;
+}
+
+// Creates a new RAS network connection to a remote peer address.
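+// If a rasConns entry for the address already exists, it is reused; the connection-level retry timer is started
+// only when a brand-new entry has to be allocated.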
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { + ncclResult_t ret = ncclSuccess; + struct rasConnection* conn = nullptr; + + // First check if a connection entry for this peer already exists. + int connIdx = rasConnFind(addr); + if (connIdx != -1) { + conn = rasConns+connIdx; + } + + if (conn && conn->sockIdx != -1) { + // An entry exists and has a socket associated with it -- nothing left for us to do. + if (pConnIdx) + *pConnIdx = connIdx; + goto exit; + } + + if (!conn) { + NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); + memcpy(&conn->addr, addr, sizeof(conn->addr)); + // We are establishing a new connection -- start the timeout. + conn->startRetryTime = clockNano(); + connIdx = conn - rasConns; + } + + if (pConnIdx) + *pConnIdx = connIdx; + + rasConnOpen(conn); + +exit: + return ret; +} + +// Opens a connection to a remote peer. +static void rasConnOpen(struct rasConnection* conn) { + ncclResult_t ret; // Not used. + struct rasSocket* sock; + bool closeSocketOnFail = false; + int ready; + + NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(&sock->sock, &conn->addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr, + /*asyncFlag*/1, /*customRetry*/1), ret, fail); + closeSocketOnFail = true; + NCCLCHECKGOTO(ncclSocketConnect(&sock->sock), ret, fail); + NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail); + + NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); + + // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures + // we don't need to clean them up. + conn->sockIdx = sock-rasSockets; + sock->connIdx = conn-rasConns; + rasPfds[sock->pfd].fd = sock->sock.fd; + + // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because + // there are other things we want to do before sending the CONNINIT, such as adding the connection to + // the network links, etc. + sock->status = RAS_SOCK_CONNECTING; + rasPfds[sock->pfd].events = (POLLIN | POLLOUT); + if (sock->sock.state == ncclSocketStateConnecting) + rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect(). + +exit: + conn->lastRetryTime = clockNano(); + // We deliberately ignore ret as this function will be retried later if needed. + return; +fail: + if (closeSocketOnFail) + (void)ncclSocketClose(&sock->sock); + goto exit; +} + +// Sends an initial RAS message to the peer after connecting to it. +static ncclResult_t rasConnPrepare(struct rasConnection* conn) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_CONNINIT); + + // The first message the RAS threads exchange provides the listening address of the connecting thread + // and the NCCL version to ensure that users aren't mixing things up. + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_CONNINIT; + msg->connInit.ncclVersion = NCCL_VERSION_CODE; + memcpy(&msg->connInit.listeningAddr, &rasNetListeningSocket.addr, sizeof(msg->connInit.listeningAddr)); + msg->connInit.peersHash = rasPeersHash; + msg->connInit.deadPeersHash = rasDeadPeersHash; + // We don't update lastSent[Dead]PeersHash because we aren't actually sending the peers themselves here. + + rasConnEnqueueMsg(conn, msg, msgLen, /*front*/true); + + // We'll finish the initialization in rasMsgHandleConnInitAck, after the other side responds. + return ncclSuccess; +} + +// Searches through rasConns for a connection with a provided address. 
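+// Returns the index into the rasConns array, or -1 if no matching connection exists.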
+int rasConnFind(const union ncclSocketAddress* addr) {
+  // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way)
+  // so binary search won't do...
+  for (int i = 0; i < nRasConns; i++) {
+    struct rasConnection* conn = rasConns+i;
+    if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0)
+      return i;
+  }
+
+  return -1;
+}
+
+// Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled
+// in the socket timeout handler earlier by terminating the problematic sockets. If a socket connection doesn't
+// exist or needs to be re-established (due to having just been terminated), we handle that here.
+// This is also where we declare peers as dead, etc.
+// Invoked from the main RAS event loop.
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
+    struct rasConnection* conn = rasConns+connIdx;
+
+    if (!conn->inUse)
+      continue;
+
+    if (conn->sockIdx != -1) {
+      struct rasSocket* sock = rasSockets+conn->sockIdx;
+      bool sockTerminated = false;
+
+      // Retry the socket connections that have been refused.
+      if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) {
+        if (now - sock->lastSendTime > RAS_CONNECT_RETRY) {
+          int ready;
+          if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) {
+            INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s",
+                 ncclSocketToString(&sock->sock.addr, rasLine));
+            rasSocketTerminate(sock, /*finalize*/true);
+            // We will retry below in the same loop.
+            sockTerminated = true;
+          } else {
+            // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations.
+            sock->lastSendTime = clockNano();
+            if (!ready && sock->sock.state == ncclSocketStateConnecting)
+              *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+            else
+              rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop.
+          } // if (ncclSocketReady)
+        } else {
+          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+        }
+      } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting)
+
+      // For connections that have data to send but that we've been unable to send a message on for a while,
+      // consider their sockets lost and terminate them.
+      if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) {
+        if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) {
+          INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s",
+               (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) /
+               CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+          rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT);
+          // We will retry below in the same loop.
+        } else {
+          *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime,
+                                                       ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT);
+        }
+      } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY)
+    } // if (conn->sockIdx != -1)
+
+    // For connections that are being (re-)established, irrespective of whether there's a valid socket associated
+    // with them (conn->sockIdx != -1), we need to check if any connection-level timeout has expired.
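+    // Three connection-level timeouts are checked below: RAS_PEER_DEAD_TIMEOUT (give up and broadcast the peer
+    // as dead), RAS_CONNECT_WARN (print a warning and start fallback connections), and RAS_CONNECT_RETRY
+    // (periodically try to reopen the socket).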
+ if (conn->startRetryTime) { + // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead + // so that we don't try again. + if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { + struct rasCollRequest bCast; + INFO(NCCL_RAS, "RAS connect retry timeout (%lds) on socket connection with %s", + (now-conn->startRetryTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // Broadcast the info about a dead peer to everybody. This will handle it locally as well, including + // declaring the peer dead and terminating the connection. + rasCollReqInit(&bCast); + bCast.type = RAS_BC_DEADPEER; + memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); + (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + + continue; + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); + } + + // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via + // the conn->sockIdx == -1 test). + + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + // rasConns may have been reallocated by the above calls. + conn = rasConns+connIdx; + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(connIdx); + } // if (!conn->experiencingDelays) + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); + } + + // If a socket was terminated (or never opened, due to some error), try to open it now. + // We retry once a second. + if (conn->sockIdx == -1) { + if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { + INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", + ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, + (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); + rasConnOpen(conn); + } + if (conn->sockIdx == -1) + *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); + } + } // if (conn->startRetryTime) + } // for (connIdx) +} + +// Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the +// RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead. +void rasConnDisconnect(const union ncclSocketAddress* addr) { + int connIdx = rasConnFind(addr); + if (connIdx != -1) { + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + rasLinkDropConn(&rasNextLink, connIdx); + rasLinkDropConn(&rasPrevLink, connIdx); + + rasConnTerminate(rasConns+connIdx); + } +} + +// Terminates a connection and frees the rasConns entry. 
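+// Any rasSockets still pointing at this connection are finalized first and all of its queued outgoing messages
+// are freed.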
+static void rasConnTerminate(struct rasConnection* conn) {
+  int connIdx = conn - rasConns;
+
+  // Make sure there are no lingering rasSockets pointing to it.
+  for (int i = 0; i < nRasSockets; i++) {
+    struct rasSocket* sock = rasSockets+i;
+    if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx)
+      rasSocketTerminate(sock, /*finalize*/true);
+  }
+
+  // Also check any ongoing collectives.
+  rasCollsPurgeConn(connIdx);
+
+  while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) {
+    free(meta);
+  }
+
+  INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine));
+
+  conn->inUse = false;
+  conn->sockIdx = -1; // Should be that way already, but just to be extra sure...
+}
+
+
+///////////////////////////////////////////
+// Functions related to the RAS sockets. //
+///////////////////////////////////////////
+
+// Accepts a new RAS network socket connection. The socket is not usable until after the handshake, as a
+// corresponding rasConnection can't be established without knowing the peer's address.
+ncclResult_t rasNetAcceptNewSocket() {
+  ncclResult_t ret = ncclSuccess;
+  struct rasSocket* sock;
+  int ready;
+  bool socketInitialized = false;
+  NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
+
+  NCCLCHECKGOTO(ncclSocketInit(&sock->sock, nullptr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr,
+                               /*asyncFlag*/1), ret, fail);
+  socketInitialized = true;
+  NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail);
+  NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail);
+
+  if (sock->sock.fd != -1) {
+    NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
+    rasPfds[sock->pfd].fd = sock->sock.fd;
+    rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also
+                                        // helps the code tell the sides apart.
+    sock->status = RAS_SOCK_CONNECTING;
+
+    INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine));
+  }
+
+exit:
+  return ret;
+fail:
+  if (socketInitialized)
+    NCCLCHECK(ncclSocketClose(&sock->sock));
+  goto exit;
+}
+
+// Allocates an entry in the rasSockets array, enlarging the array if necessary.
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock) {
+  struct rasSocket* sock;
+  int i;
+  for (i = 0; i < nRasSockets; i++)
+    if (rasSockets[i].status == RAS_SOCK_CLOSED)
+      break;
+  if (i == nRasSockets) {
+    NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT));
+    nRasSockets += RAS_INCREMENT;
+  }
+
+  sock = rasSockets+i;
+  memset(sock, '\0', sizeof(*sock));
+  sock->pfd = -1;
+  sock->connIdx = -1;
+  sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano();
+
+  *pSock = sock;
+  return ncclSuccess;
+}
+
+// Invoked from the main RAS event loop to handle RAS socket timeouts.
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+    struct rasSocket* sock = rasSockets+sockIdx;
+
+    if (sock->status == RAS_SOCK_CLOSED)
+      continue;
+
+    // For socket connections that are still being established, give up on the ones that take too long to initialize.
+ if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) { + if (now - sock->createTime > RAS_STUCK_TIMEOUT) { + if (sock->connIdx == -1) { + INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s", + (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + } else { + struct rasConnection* conn = rasConns+sock->connIdx; + INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s " + "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", + (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine), + conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0), + sock->status); + } + rasSocketTerminate(sock, /*finalize*/true); + // We may retry later. + continue; + } else { + *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) + + // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. + if (sock->status == RAS_SOCK_TERMINATING) { + if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) { + INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s", + (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // This socket is presumably already being re-established, if needed. + continue; + } else { + *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_TERMINATING) + + // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything + // important due to shorter timeouts on RAS network connections, but in case of weird situations like process + // suspend, rasSocketTerminate will do additional checking. + if (sock->status == RAS_SOCK_READY) { + if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) { + INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s", + (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false); + continue; + // The RAS network timeout handler will terminate the conn it was associated with, if any. + } else { + *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_READY) + } // for (sockIdx) +} + +// Handles the termination of a RAS socket. +// We try to do it in stages for established sockets (in READY state). We shut down just the sending side +// for them and change their state to TERMINATING, so that we can still receive data that may be in the buffers. +// Once we get an EOF when receiving data, we finalize the termination. +// For not fully established sockets, we can terminate immediately as there's no useful data to extract. 
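+// startRetryOffset lets the caller backdate the connection-level retry timer by the timeout that already expired
+// before the termination; retry == false skips re-establishing the connection unless it is part of a RAS network link.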
+void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
+  assert(sock->status != RAS_SOCK_CLOSED);
+  if (sock->connIdx != -1) {
+    struct rasConnection* conn = rasConns+sock->connIdx;
+    // If the sockIdx of the connection points back to us, it means that we are the current socket of this
+    // connection, so we have additional work to do before we can terminate it.
+    if (conn->sockIdx == sock-rasSockets) {
+      // Reset it to indicate there's no valid socket associated with that connection anymore.
+      conn->sockIdx = -1;
+
+      // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
+      // deliberately closed them. Make an exception for sockets that are part of the RAS network links.
+      if ((retry &&
+           clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) < RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
+          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+        // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
+        // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
+        // we need to include that timeout as well.
+        if (conn->startRetryTime == 0) {
+          conn->startRetryTime = conn->lastRetryTime = clockNano() - startRetryOffset;
+        }
+
+        // We also filter through the sendQ, eliminating any messages that won't need to be sent when the socket
+        // connection is re-established (that's essentially the server init and keep-alives).
+        // As ncclIntruQueue can't be iterated, we transfer the content in bulk to a temporary and then filter the
+        // messages as we move them back one-by-one.
+        struct ncclIntruQueue sendQTmp;
+        ncclIntruQueueConstruct(&sendQTmp);
+        ncclIntruQueueTransfer(&sendQTmp, &conn->sendQ);
+        while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&sendQTmp)) {
+          if (meta->msg.type != RAS_MSG_CONNINIT && meta->msg.type != RAS_MSG_CONNINITACK &&
+              meta->msg.type != RAS_MSG_KEEPALIVE) {
+            if (meta->offset != 0) {
+              // Reset the progress of any partially-sent messages (they will need to be resent from the beginning;
+              // in principle that could apply to the first message only).
+              meta->offset = 0;
+            }
+            ncclIntruQueueEnqueue(&conn->sendQ, meta);
+          } else { // RAS_MSG_CONNINIT || RAS_MSG_CONNINITACK || RAS_MSG_KEEPALIVE
+            free(meta);
+          }
+        } // while (meta)
+      } // if (retry)
+
+      // Stop collectives from waiting for a response over this connection.
+      rasCollsPurgeConn(sock->connIdx);
+    } // if (conn->sockIdx == sock-rasSockets)
+  } // if (sock->connIdx != -1)
+
+  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+    if (sock->status != RAS_SOCK_TERMINATING) {
+      // The receiving side is still open -- close just the sending side.
+      (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
+      rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send.
+      // The timeout for this socket starts ticking now...
+      sock->lastSendTime = clockNano();
+      sock->status = RAS_SOCK_TERMINATING;
+    }
+    // Else it must be in RAS_SOCK_TERMINATING state already -- in that case we do nothing here and instead
+    // we wait for an EOF on the receiving side or for a timeout.
+  } else {
+    // Either the caller requested finalization or we cannot receive on it.
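+    // Close the socket immediately, release its poll array entry, and drop any partially received message.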
+ (void)ncclSocketClose(&sock->sock); + sock->status = RAS_SOCK_CLOSED; + rasPfds[sock->pfd].fd = -1; + rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + sock->pfd = sock->connIdx = -1; + sock->recvOffset = sock->recvLength = 0; + free(sock->recvMsg); + sock->recvMsg = nullptr; + } +} + +// Handles a ready socket FD from the main event loop. +void rasSockEventLoop(int sockIdx, int pollIdx) { + struct rasSocket* sock = rasSockets+sockIdx; + + if (sock->status == RAS_SOCK_CONNECTING) { + int ready; + // Socket is not yet fully established. Continue the OS or NCCL-level handshake. + if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from ncclSocketReady; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else { + if (ready) { + // We can tell the connect-side based on what events is set to. + bool connectSide = (rasPfds[pollIdx].events & POLLOUT); + (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano(); + sock->status = RAS_SOCK_HANDSHAKE; + if (connectSide) { + assert(sock->connIdx != -1); + if (rasConns[sock->connIdx].sockIdx == sockIdx) { + if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } + } else { + // The connection this socket is associated with no longer considers it to be the current one. + // This could possibly happen due to a race condition. Simply terminate it. + INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + } + } // if (connectSide) + } else { // !ready + if (sock->sock.state == ncclSocketStateConnecting) + rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect(). + } + } // if (ncclSocketReady) + } else { // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING. + // The extra test for TERMINATING is there to take care of a race when the handling of one socket + // results in another socket being terminated, but one that already has revents waiting from poll. + if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) { + int closed = 0; + bool allSent = false; + assert(sock->connIdx != -1); + struct rasConnection* conn = rasConns+sock->connIdx; + assert(conn->sockIdx == sockIdx); + if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else if (closed) { + INFO(NCCL_RAS, "RAS socket connection with %s closed by peer on send; terminating it", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else { + sock->lastSendTime = clockNano(); + if (allSent) + rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send for now. 
+ } + } + if (rasPfds[pollIdx].revents & POLLIN) { + struct rasMsg* msg; + do { + int closed = 0; + msg = nullptr; + if (rasMsgRecv(sock, &msg, &closed) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasMsgRecv; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // We may retry further down. + } else if (closed) { + const char* socketType; + if (sock->connIdx == -1) + socketType = "incoming"; + else if (rasConns[sock->connIdx].sockIdx != sockIdx) + socketType = "old"; + else if (sock->status == RAS_SOCK_HANDSHAKE) + socketType = "new"; + else + socketType = "current"; + INFO(NCCL_RAS, "RAS %s socket connection with %s closed by peer on receive; terminating it", + socketType, ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // We may retry further down. + } else { + sock->lastRecvTime = clockNano(); + if (msg) { + (void)rasMsgHandle(msg, sock); + free(msg); + // Message handlers can terminate a socket in certain cases; we need to check for + // that here so that we don't try to receive from a closed socket. + // No handlers are currently believed to create new sockets but better to be safe than sorry + // and re-init the sock variable. + sock = rasSockets+sockIdx; + if (sock->status == RAS_SOCK_CLOSED) + break; + } + if (sock->connIdx != -1) { + struct rasConnection* conn = rasConns+sock->connIdx; + if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays)) + rasConnResume(conn); + } + } + } while (msg); + } // if (POLLIN) + } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING +} + + +//////////////////////////////////////////////////////////////// +// Functions related to the handling of RAS network timeouts. // +//////////////////////////////////////////////////////////////// + +// Invoked from the main RAS event loop to handle RAS network timeouts. +void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) { + // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each + // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish + // connections that are part of a link from those that are not. + for (int connIdx = 0; connIdx < nRasConns; connIdx++) + rasConns[connIdx].linkFlag = false; + + (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup); + (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup); + + for (int connIdx = 0; connIdx < nRasConns; connIdx++) { + struct rasConnection* conn = rasConns+connIdx; + if (conn->inUse && !conn->linkFlag) { + // The connection is not part of any link. Check if it should be terminated. + if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) { + rasConnTerminate(conn); + continue; + } + } + } +} + +// Checks for and handles timeouts at the link level; primarily the keep-alives for link connections. +static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (linkConn->connIdx != -1) { + if (!rasConns[linkConn->connIdx].linkFlag) { + rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup); + // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here. + // For the same reason we re-init linkConn. 
+ linkConn = link->conns+i; + rasConns[linkConn->connIdx].linkFlag = true; + } + } else if (i == 0 && link->lastUpdatePeersTime != 0) { + // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address + // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action. + if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) { + INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s", + (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); + if (linkConn->connIdx != -1) { + rasConns[linkConn->connIdx].linkFlag = true; + } + // We used to connect to the first fallback but I think trying to connect to the calculated primary first + // in this case is more intuitive. + //(void)rasLinkTryFallback(link, -1); + link->lastUpdatePeersTime = 0; + } else { + *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN); + } + } // if (i == 0 && link->lastUpdatePeerTime != 0) + } // for (i) + + return ncclSuccess; +} + +// Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links. +static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { + struct rasConnection* conn = rasConns+connIdx; + if (conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + + if (sock->status == RAS_SOCK_READY) { + // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued. + if (ncclIntruQueueEmpty(&conn->sendQ)) { + if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { + rasConnSendKeepAlive(conn); + } else { + *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); + } + } + + // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections. + if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s", + (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + + // At this point, it's mostly just a precaution; we will continue with the primary connection until + // RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + // rasConns and rasSockets may have been reallocated by the above calls. + conn = rasConns+connIdx; + sock = rasSockets+conn->sockIdx; + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(connIdx); + } + } else { + *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); + } + + // For long timeouts we need to act. + if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { + INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s", + (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); + *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait. 
+      } else {
+        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
+      }
+    } // if (sock->status == RAS_SOCK_READY)
+  } // if (conn->sockIdx != -1)
+}
+
+// Sends a keep-alive message to a peer on the RAS network.
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
+  struct rasMsg* msg = nullptr;
+  int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
+  if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
+    int linkIdx;
+    msg->type = RAS_MSG_KEEPALIVE;
+    msg->keepAlive.peersHash = rasPeersHash;
+    msg->keepAlive.deadPeersHash = rasDeadPeersHash;
+    msg->keepAlive.nack = (nack ? 1 : 0);
+
+    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
+    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+      msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
+    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
+    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+      msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
+
+    (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
+
+    rasConnEnqueueMsg(conn, msg, msgLen);
+  }
+}
+
+// Handles incoming keep-alive messages.
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
+  struct timespec currentTime;
+  int64_t travelTime;
+  int peerIdx;
+
+  assert(sock->connIdx != -1);
+  struct rasConnection* conn = rasConns+sock->connIdx;
+  SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
+  travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
+               (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
+
+  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
+    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  }
+  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
+    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  }
+
+  // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
+  // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
+  // needed).
+  peerIdx = rasPeerFind(&conn->addr);
+  // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
+  // the peers update.
+  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+
+  // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
+  // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
+  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
+  // will have wiped any external fallbacks, so anything that remains must be needed.
+  if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) {
+    if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) {
+      // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the
+      // special nack flag in the message to distinguish it from regular keep-alives.
+ rasConnSendKeepAlive(conn, /*nack*/true); + } + } + + if (conn->travelTimeMin > travelTime) + conn->travelTimeMin = travelTime; + if (conn->travelTimeMax < travelTime) + conn->travelTimeMax = travelTime; + conn->travelTimeSum += travelTime; + conn->travelTimeCount++; + + if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) { + // This could happen due to a short-lived race condition between the peers propagation + // process and the periodic keep-alive messages (perhaps we'll see it regularly at scale?). + // Just in case there's some unforeseen problem with the peers propagation though, exchange with the + // remote to get everybody in sync. + INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash); + INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash); + NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + } + return ncclSuccess; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Functions related to the RAS links and recovery from connection failures. // +/////////////////////////////////////////////////////////////////////////////// + +// Checks if the connection (that we just detected some problem with) is part of the RAS link and if so, +// tries to initiate a(nother) fallback connection if needed. +// External connections are generally ignored by this whole process: in particular, we don't add fallbacks for +// timing out external connections. However, we will use an active external connection if it would be a better +// option than whatever we can come up with. +static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { + int peerIdx = -1; + int linkIdx = -1; + int firstExtLinkIdx = -1; + int newPeerIdx; + + // First check if the connection is part of this link. In the process also check if any of the link's connections + // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out. + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + + if (linkConn->peerIdx == -1) { + // Such elements are always at the very end of the array and we can't use them so we can just as well break. + break; + } + + // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing + // delays). + if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + if (!conn->experiencingDelays) { + if (!linkConn->external) + goto exit; // We don't need to do anything if there's a non-external connection. + else if (linkConn->peerIdx != -1) { + // Record the location of the first potentially viable external connection in the chain; we may prefer it + // over anything we can come up with. + if (firstExtLinkIdx == -1) + firstExtLinkIdx = i; + if (linkIdx != -1) + break; // Break out of the loop if we already have all the data we might need. + } // linkConn->external && linkConn->peerIdx != -1 + } // if (!conn->experiencingDelays) + } // if (linkConn->connIdx != -1) + + if (linkConn->connIdx == connIdx) { + if (linkConn->external) + goto exit; // We don't add fallbacks for external connections... 
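+      // Remember where the failing connection sits within the link; the fallback peer is calculated relative
+      // to that position.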
+ peerIdx = linkConn->peerIdx; + linkIdx = i; + // We are not breaking out of the loop here because we want to check for active connections on *all* potentially + // viable elements (in particular, there could be some external ones beyond this one). + } + } + + if (linkIdx == -1) + goto exit; + + // We found an existing element so the connection is part of the link. No existing non-external connections of this + // link are active, so a fallback is needed. + assert(peerIdx != -1); + newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0); + // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists + // and is also experiencing delays, we need to keep iterating. + while (newPeerIdx != -1) { + int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); + // If we previously found a potential external fallback connection, check if it's better than what we just found. + if (firstExtLinkIdx != -1) { + linkIdx = -1; + // Calculate the index that the newly found fallback would have (pretend mode). + NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true, + &linkIdx)); + assert(linkIdx != -1); + if (firstExtLinkIdx < linkIdx) { + // The external connection *is* better -- use it as a fallback instead and be done. + link->conns[firstExtLinkIdx].external = false; + goto exit; + } + } + NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false, + &linkIdx)); + if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx) + firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index. + + INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s", + link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"), + linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine)); + // Note that we don't follow here our convention of "lower address is the one establishing connections" -- + // that convention is for optimizing regular operations, but we don't want to take chances during fault + // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those. + if (newConnIdx == -1) + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx)); + + struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; + // If the fallback connection is also experiencing delays, we need to keep trying. + if (!conn->experiencingDelays) + break; + INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + + newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); + } + if (newPeerIdx == -1) + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); +exit: + return ncclSuccess; +} + +// Invoked when we receive a message over a connection that was just activated or was experiencing delays. +// Cleans up the fallbacks, timers, etc, as appropriate. +static void rasConnResume(struct rasConnection* conn) { + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", + (conn->experiencingDelays && conn->startRetryTime == 0 ? 
"recovered" : "established"), + ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); + + conn->experiencingDelays = false; + + conn->startRetryTime = conn->lastRetryTime = 0; + + rasLinkSanitizeFallbacks(&rasNextLink); + rasLinkSanitizeFallbacks(&rasPrevLink); + + if (!ncclIntruQueueEmpty(&conn->sendQ)) + rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT; + } +} + +// Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed). +static void rasLinkSanitizeFallbacks(struct rasLink* link) { + if (link->nConns > 0 && link->conns[0].connIdx != -1) { + struct rasConnection* conn = rasConns+link->conns[0].connIdx; + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the + // keepAlive messages). + for (int i = 1; i < link->nConns; i++) { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (link->conns[i].external ? "external " : ""), i, + ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine)); + } + link->nConns = 1; + link->lastUpdatePeersTime = 0; + } + } +} + +// Attempt to drop a connection from a link. +static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) { + if (linkIdx == -1) + linkIdx = rasLinkFindConn(link, connIdx); + if (linkIdx != -1) { + if (linkIdx == 0) { + INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", + link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + } else { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx, + ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + } + memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns)); + if (link->nConns > 1) + link->nConns--; + else { + link->conns[0].peerIdx = link->conns[0].connIdx = -1; + } + + if (linkIdx == 0) { + // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if + // the remote peer loses interest in it). + link->conns[0].external = false; + if (link->conns[0].connIdx != -1) { + INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", + link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine)); + } + rasLinkSanitizeFallbacks(link); + } + } +} + +// Checks if a given connection is a member of this link and if so, returns its entry index. +// Returns -1 if connection not found. +static int rasLinkFindConn(const struct rasLink* link, int connIdx) { + for (int i = 0; i < link->nConns; i++) { + if (link->conns[i].connIdx == connIdx) + return i; + } + return -1; +} + +// Note: the behavior of this function has become super-complex and so it should be considered for refactoring. +// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is +// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also +// be -1 (the latter are stored at the end). +// external provides an updated value for the entry's external field. 
A false value, if requested, is always set; +// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry +// already exists and the function is invoked with external == true, the new value will be ignored. +// If insert is set, it will, if necessary, insert a new entry if one is not already there. +// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate. +// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored. +// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external). +// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed +// (the entry's external must match the argument external for it to be removed). +ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert, + bool pretend, int* pLinkIdx) { + int i, oldLinkIdx = -1; + + if (external && connIdx != -1) + insert = true; + + if (connIdx != -1) { + // Start by checking if we already have an element with this connIdx. + oldLinkIdx = rasLinkFindConn(link, connIdx); + if (oldLinkIdx != -1) { + struct rasLinkConn* linkConn = link->conns+oldLinkIdx; + if (linkConn->peerIdx != -1) + assert(linkConn->peerIdx == peerIdx); + + if (linkConn->peerIdx == peerIdx) { + if (!external && !pretend) + linkConn->external = false; // Ensure that external is cleared if so requested. + if (pLinkIdx) + *pLinkIdx = oldLinkIdx; + goto exit; // Nothing more to do if both connIdx and peerIdx are up to date. + } + + // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong + // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external + // connections. + assert(external); + } + } + + if (peerIdx != -1) { + // Search for the right spot in the conns array. + for (i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (peerIdx != -1 && linkConn->peerIdx == peerIdx) { + // The exact conn element already exists. + if (connIdx == -1 && !insert) { + // Drop the connection from the link. + if (linkConn->external == external) { + if (!pretend) + rasLinkDropConn(link, linkConn->connIdx, i); + else if (pLinkIdx) + *pLinkIdx = i; + } + } else { // connIdx != -1 || insert + if (!pretend) { + if (linkConn->connIdx != -1) + assert(linkConn->connIdx == connIdx); + else + linkConn->connIdx = connIdx; + if (!external) + linkConn->external = false; // Ensure that external is cleared if so requested. + if (i == 0) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. + rasLinkSanitizeFallbacks(link); + } + } // if (!pretend) + if (pLinkIdx) + *pLinkIdx = i; + } // connIdx != -1 || insert + + goto exit; + } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx) + if (!insert) + continue; + // Ensure that the i-1 index is also valid. + if (i == 0) + continue; + // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them. + if (peerIdx != -1 && linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. 
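+      // (the peerIdx ordering wraps around between conns[i-1] and conns[i]; the new peerIdx belongs here if it
+      // falls after conns[i-1] or before conns[i] in the link's direction).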
+ if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } + } // for (i) + } else { + // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections. + assert(external && oldLinkIdx == -1); + i = link->nConns; + } + if (!insert) + goto exit; + + // i holds the index at which to insert a new element. + if (pretend) { + if (pLinkIdx) + *pLinkIdx = i; + goto exit; + } + + if (oldLinkIdx == -1) { + struct rasLinkConn* linkConn; + if (link->nConns == link->connsSize) { + NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); + link->connsSize += RAS_INCREMENT; + } + linkConn = link->conns+i; + // Shift existing conns with indices >= i to make room for the new one. + memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns)); + linkConn->peerIdx = peerIdx; + linkConn->connIdx = connIdx; + linkConn->external = external; + if (external) { + INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, + ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine)); + } + link->nConns++; + } + else { // oldLinkIdx > -1 + // We already have the conn, we just need to move it to a new spot. + struct rasLinkConn* linkConn = link->conns+i; + assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1. + if (i != oldLinkIdx) { + struct rasLinkConn tmp; + struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler. + // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns + // with indices in the range [i, oldLinkIdx). + memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp)); + memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn)); + memcpy(linkConn, &tmp, sizeof(*linkConn)); + } + if (!external) + linkConn->external = false; // Ensure that external is cleared if so requested. + } // oldLinkIdx > -1 + if (pLinkIdx) + *pLinkIdx = i; +exit: + return ncclSuccess; +} diff --git a/src/register.cc b/src/register.cc deleted file mode 100644 index c4ca4b4a0..000000000 --- a/src/register.cc +++ /dev/null @@ -1,204 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "argcheck.h" // Need some checks here since we access comm -#include "nccl.h" -#include "comm.h" -#include "net.h" -#include "register.h" -#include "transport.h" - -ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { - struct ncclRegCache* cache = &comm->regCache; - ncclDebugNoWarn = NCCL_NET; - for (int d=0; dnDevs; d++) { - if (reg->handles[d] != NULL) NCCLCHECK(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d])); - } - reg->nDevs = 0; - free(reg->handles); - reg->handles = NULL; - ncclDebugNoWarn = 0; - return ncclSuccess; -} - -ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) { - struct ncclRegCache* cache = &comm->regCache; - int netCount = 0; - if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount)); - if (netCount == 0) return ncclSuccess; - - ncclResult_t ret = ncclSuccess; - - // Find local devices for p2p operations - for (int c=0; cp2pnChannels; c++) { - int dev; - if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net - ncclNetProperties_t props; - NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end); - if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration. - reg->nDevs = 0; - break; - } - int found = 0; - for (int d=0; dnDevs; d++) if (reg->devs[d] == dev) found = 1; - if (!found) reg->devs[reg->nDevs++] = dev; - } - - NCCLCHECKGOTO(ncclCalloc(®->handles, reg->nDevs), ret, end); - - ncclDebugNoWarn = NCCL_NET; - for (int d=0; dnDevs; d++) { - int dev = reg->devs[d]; - reg->handles[d] = NULL; - - if (cache->sComms[dev] == NULL) { - // Create a loopback network comm object for that device to register the buffers. 
- void *lComm = NULL; - ncclNetHandle_t netHandle; - bool connected = false; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end); - while (!connected) { - if (*comm->abortFlag) { - goto end; - } - if (cache->sComms[dev] == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end); - if (cache->rComms[dev] == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end); - connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL); - } - NCCLCHECK(comm->ncclNet->closeListen(lComm)); - } - if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) { - reg->handles[d] = NULL; - NCCLCHECK(ncclNetDeregister(comm, reg)); - reg->nDevs = 0; - goto end; - } - } -end: - INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs); - ncclDebugNoWarn = 0; - if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg)); - return ret; -} - -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - - *reg = NULL; - for (int slot=0; /*true*/; slot++) { - if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; - if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - *reg = cache->slots[slot]; - return ncclSuccess; - } - } -} -NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); - -ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) { - if (!ncclParamLocalRegister()) { - *handle = NULL; - return ncclSuccess; - } - INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - for (int slot=0; /*true*/; slot++) { - if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { - if (cache->population == cache->capacity) { // must grow cache - cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; - NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); - } - memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); - NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); - struct ncclReg* regSlot = cache->slots[slot]; - regSlot->addr = addr; - regSlot->pages = pages; - regSlot->refs = 1; - NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot)); - regSlot->state |= NET_REG_COMPLETE; - cache->population += 1; - *handle = regSlot; - return ncclSuccess; - } else if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - cache->slots[slot]->refs++; - *handle = cache->slots[slot]; - return ncclSuccess; - } - } -} - -ncclResult_t ncclRegCleanup(struct ncclComm* comm) { - struct ncclRegCache* cache = &comm->regCache; - for (int i=0; ipopulation; i++) { - INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)cache->slots[i]->addr, cache->slots[i]->pages); - NCCLCHECK(ncclNetDeregister(comm, cache->slots[i])); - if (cache->slots[i]->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&cache->slots[i]->mcHandle, cache->slots[i]->regAddr, cache->slots[i]->dev, cache->slots[i]->regSize)); - free(cache->slots[i]); - } - free(cache->slots); - for (int d=0; dsComms[d]) NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d])); - if (cache->rComms[d]) NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d])); - } - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); -ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { - NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); - if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister")); - NCCLCHECK(ncclRegister(comm, buff, size, handle)); - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); -ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { - NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); - struct ncclReg* reg = (struct ncclReg*)handle; - struct ncclRegCache* cache = &comm->regCache; - int slot; - int saveDev; - if (handle == NULL) goto exit; - CUDACHECK(cudaGetDevice(&saveDev)); - CUDACHECK(cudaSetDevice(comm->cudaDev)); - for (slot=0; slotpopulation && cache->slots[slot] != reg; slot++); - if (slot == cache->population) { - WARN("Deregister: Could not find handle"); - return ncclInvalidUsage; - } - if (--reg->refs) return ncclSuccess; - NCCLCHECK(ncclNetDeregister(comm, reg)); - if (reg->state & NVLS_REG_COMPLETE) { - NCCLCHECK(ncclNvlsDeregBuffer(®->mcHandle, reg->regAddr, reg->dev, reg->regSize)); - reg->regAddr = (CUdeviceptr)NULL; - } - if (reg->state & COLLNET_REG_COMPLETE) { - NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle)); - } - if (reg->state & IPC_REG_COMPLETE) { - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i) - if (reg->ipcInfos[i]) - NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i])); - if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs); - if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs)); - } - free(reg); - memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*)); - cache->population -= 1; - CUDACHECK(cudaSetDevice(saveDev)); -exit: - return ncclSuccess; -} diff --git a/src/register/coll_reg.cc 
b/src/register/coll_reg.cc new file mode 100644 index 000000000..4282dc9c8 --- /dev/null +++ b/src/register/coll_reg.cc @@ -0,0 +1,446 @@ +#include "register.h" +#include "transport.h" +#include "enqueue.h" + +static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { + if (conn->connected) { + if (conn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + *needReg = true; + } else { + // network connection + *needReg = false; + } + } else { + struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer]; + struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank]; + int canConnect = 0; + NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); + if (canConnect) { + *needReg = true; + } else { + *needReg = false; + } + } + return ncclSuccess; +} + +ncclResult_t ncclRegisterCollNvlsBuffers( + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect + ) { + ncclResult_t result = ncclSuccess; + + info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; + if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; +#if CUDART_VERSION >= 11030 + if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; + int nvlsReged = 0; + int collnetReged = 0; + const void *sendbuff = info->sendbuff; + void *recvbuff = info->recvbuff; + void *recvHandle = NULL, *sendHandle = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + + /* first try graph registration. */ + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); + } + + if (nvlsReged == 0 && ncclParamLocalRegister()) { + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv); + } + + if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } + + if (collnetReged == 0 && ncclParamLocalRegister()) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } + } + + if (nvlsReged) { + *regNeedConnect = 0; + /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to + * saturate bandwidth. 
*/ + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + else + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + } else { + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + } + info->regBufType |= NCCL_NVLS_REG_BUFFER; + } + + if (collnetReged) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendMhandle = sendHandle; + info->recvMhandle = recvHandle; + } + } +exit: +#endif + return result; +} + +ncclResult_t ncclRegisterCollBuffers( + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect + ) { + ncclResult_t result = ncclSuccess; + + info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; + if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; +#if CUDART_VERSION >= 11030 + if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + /* this part of nvls reg code is temporarily not used and obsolete. */ + if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; + int nvlsReged = 0; + int collnetReged = 0; + const void *sendbuff = info->sendbuff; + void *recvbuff = info->recvbuff; + void *recvHandle = NULL, *sendHandle = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + + /* first try local registration. */ + if (ncclParamLocalRegister()) { + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv); + } + + if (nvlsReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); + } + + if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + if (ncclParamLocalRegister()) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } + + if (collnetReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } + } + + if (nvlsReged) { + *regNeedConnect = 0; + /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to + * saturate bandwidth. 
*/ + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + else + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + } else { + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + } + info->regBufType |= NCCL_NVLS_REG_BUFFER; + } + + if (collnetReged) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendMhandle = sendHandle; + info->recvMhandle = recvHandle; + } + } else if (info->protocol == NCCL_PROTO_SIMPLE) { + // IPC buffer registration + if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit; + if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; + if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; + if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; + + int peerRanks[NCCL_MAX_LOCAL_RANKS]; + int nPeers = 0; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + int regBufFlag = 0; + memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS); + + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + struct ncclChannel* channel = comm->channels; + int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0; + void *sendHandle, *recvHandle; + if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) { + for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { + for (int down = 0; down < 2; ++down) { + int peer = down ? 
channel->collnetDirect.down[r] : channel->collnetDirect.up[r]; + if (peer != -1) { + struct ncclConnector* peerConn = &channel->peers[peer]->recv[0]; + bool needReg = false; + + NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg)); + if (needReg) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + } + + if (nPeers > 0) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + if (ipcRegFlag) ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!ipcRegFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs); + if (ipcRegFlag) ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (ipcRegFlag) { + info->regBufType |= NCCL_IPC_REG_BUFFER; + } + } + + // register collnet buffer + if (info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && !(info->func == ncclFuncAllReduce && !comm->isOneRPN)) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + info->sendMhandle = sendHandle; + if (netSendRegFlag) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + info->recvMhandle = recvHandle; + } + } + + if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) { + if (!netSendRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle); + info->sendMhandle = sendHandle; + } + if (netSendRegFlag && !netRecvRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle); + info->recvMhandle = recvHandle; + } + } + } + + if (netSendRegFlag && netRecvRegFlag) { + if (comm->isOneRPN) info->nMaxChannels = 1; + info->regBufType |= NCCL_NET_REG_BUFFER; + } + } else if (info->algorithm == NCCL_ALGO_RING) { + struct ncclReg* recvRegRecord = NULL; + struct ncclReg* sendRegRecord = NULL; + int sendNetPeers = comm->nChannels; + int recvNetPeers = comm->nChannels; + struct ncclConnector** sendNetConns = NULL; + struct ncclConnector** recvNetConns = NULL; + void** sendNetHandles = NULL; + void** recvNetHandles = NULL; + void** srecvNetHandles = NULL; + bool hasRecvNetPeer = false; + bool hasSendNetPeer = false; + + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &sendRegRecord)); + if (sendRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + 
NCCLCHECK(ncclCalloc(&sendNetConns, comm->nChannels)); + NCCLCHECK(ncclCalloc(&sendNetHandles, comm->nChannels)); + NCCLCHECK(ncclCalloc(&recvNetConns, comm->nChannels)); + NCCLCHECK(ncclCalloc(&recvNetHandles, comm->nChannels)); + NCCLCHECK(ncclCalloc(&srecvNetHandles, comm->nChannels)); + + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + for (int r = 0; r < 2; ++r) { + int peer; + struct ncclConnector* peerConn; + if (r == 0) { + peer = channel->ring.prev; + peerConn = &channel->peers[peer]->recv[0]; + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + recvNetConns[c] = peerConn; + hasRecvNetPeer = true; + } + } else { + peer = channel->ring.next; + peerConn = &channel->peers[peer]->send[0]; + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + sendNetConns[c] = peerConn; + hasSendNetPeer = true; + } + } + if (peerConn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0 && comm->intraNodeP2pSupport) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + + // start net registration + regBufFlag = 0; + if (!comm->useNetPXN && comm->useGdr && comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + if (hasSendNetPeer) { + ncclNetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, ®BufFlag, sendNetHandles, cleanupQueue, &info->nCleanupQueueElts); + if (regBufFlag) + ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, ®BufFlag, srecvNetHandles, cleanupQueue, &info->nCleanupQueueElts); + } + if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer) + ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, ®BufFlag, recvNetHandles, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + if (hasSendNetPeer) { + ncclNetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, ®BufFlag, sendNetHandles); + if (regBufFlag) + ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, ®BufFlag, srecvNetHandles); + } + if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer) + ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, ®BufFlag, recvNetHandles); + } + } + + if (regBufFlag) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendNetHandles = sendNetHandles; + info->recvNetHandles = recvNetHandles; + info->srecvNetHandles = srecvNetHandles; + if (comm->isOneRPN && (info->func == ncclFuncAllGather || info->func == ncclFuncBroadcast)) { + info->nMaxChannels = 1; + } + } else { + free(sendNetHandles); + free(recvNetHandles); + free(srecvNetHandles); + } + + free(sendNetConns); + free(recvNetConns); + } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == 
NCCL_ALGO_COLLNET_CHAIN) { + struct ncclReg* recvRegRecord; + int netSendRegFlag = 0, netRecvRegFlag = 0; + void *sendHandle, *recvHandle; + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + if (comm->intraNodeP2pSupport) { + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + struct ncclTree* tree = NULL; + int peers[NCCL_MAX_TREE_ARITY + 1]; + + if (info->algorithm == NCCL_ALGO_TREE) + tree = &channel->tree; + else + tree = &channel->collnetChain; + for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p]; + peers[NCCL_MAX_TREE_ARITY] = tree->up; + for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) { + int peer = peers[p]; + bool peerNeedReg = false; + struct ncclConnector* recvConn = NULL; + // P2P transport + if (peer == -1 || peer == comm->nRanks) continue; + recvConn = &channel->peers[peer]->recv[0]; + NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg)); + + if (peerNeedReg) { + bool found = false; + for (int pindex = 0; pindex < nPeers; ++pindex) { + if (peerRanks[pindex] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + } + + // register collnet chain 1RPN buffer + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && comm->isOneRPN) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + info->sendMhandle = sendHandle; + if (netSendRegFlag) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + info->recvMhandle = recvHandle; + } + } + + if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) { + if (!netSendRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle); + info->sendMhandle = sendHandle; + } + if (netSendRegFlag && !netRecvRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle); + info->recvMhandle = recvHandle; + } + } + } + + if (netSendRegFlag && netRecvRegFlag) { + if (comm->isOneRPN) info->nMaxChannels = 1; + info->regBufType |= NCCL_NET_REG_BUFFER; + } + } + + if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) { + info->nMaxChannels = 16; + } + } +exit: +#endif + return result; +} diff --git a/src/register/register.cc b/src/register/register.cc new file mode 100644 index 000000000..9e8f6eaaf --- /dev/null +++ b/src/register/register.cc @@ -0,0 +1,179 @@ 
+/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "argcheck.h" // Need some checks here since we access comm +#include "nccl.h" +#include "comm.h" +#include "net.h" +#include "register.h" +#include "transport.h" + +ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { + struct ncclRegCache* cache = &comm->regCache; + uintptr_t pageSize = cache->pageSize; + uintptr_t addr = (uintptr_t)data & -pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + + *reg = NULL; + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; + if ((addr >= cache->slots[slot]->addr) && + ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + *reg = cache->slots[slot]; + return ncclSuccess; + } + } +} +NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); + +ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { + if (reg && isValid) { + if (reg->localRefs) + *isValid = true; + else + *isValid = false; + } + return ncclSuccess; +} + +ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) { + NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); + struct ncclRegCache* cache = &comm->regCache; + uintptr_t pageSize = cache->pageSize; + uintptr_t addr = (uintptr_t)data & -pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + + if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister")); + INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); + + for (int slot=0; /*true*/; slot++) { + if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { + if (cache->population == cache->capacity) { // must grow cache + cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; + NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); + } + memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); + NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); + struct ncclReg* regSlot = cache->slots[slot]; + regSlot->addr = addr; + regSlot->pages = pages; + if (isGraph) regSlot->graphRefs = 1; + else regSlot->localRefs = 1; + cache->population += 1; + *handle = regSlot; + goto exit; + } else if ((addr >= cache->slots[slot]->addr) && + ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + if (isGraph) cache->slots[slot]->graphRefs++; + else cache->slots[slot]->localRefs++; + *handle = cache->slots[slot]; + goto exit; + } + } + +exit: + return ncclSuccess; +} + +static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) { + if (reg->state & NET_REG_COMPLETE) { + struct ncclRegNetHandles* netHandle = reg->netHandleHead; + struct ncclRegNetHandles* netHandlePrev; + while(netHandle) { + if (ncclNetDeregBuffer(comm, netHandle->proxyConn, netHandle->handle) != ncclSuccess) { + WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, netHandle->handle, netHandle->proxyConn->rank); + } + netHandlePrev = netHandle; + netHandle = netHandle->next; + free(netHandlePrev); + } + } + if (reg->state & NVLS_REG_COMPLETE) { + if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) { + WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); + } + reg->regAddr = (CUdeviceptr)NULL; + } + if (reg->state & COLLNET_REG_COMPLETE) { + if (ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle) != ncclSuccess) { + WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank); + } + } + if (reg->state & IPC_REG_COMPLETE) { + for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i) + if (reg->ipcInfos[i]) { + if (ncclIpcDeregBuffer(comm, reg->ipcInfos[i]) != ncclSuccess) { + WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[i]->baseAddr, reg->ipcInfos[i]->peerRank); + } + free(reg->ipcInfos[i]); + } + if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs); + if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs)); + } + return ncclSuccess; +} + +ncclResult_t ncclRegCleanup(struct ncclComm* comm) { + struct ncclRegCache* cache = &comm->regCache; + for (int i = 0; i < cache->population; i++) { + struct ncclReg* reg = cache->slots[i]; + INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages); + NCCLCHECK(regCleanup(comm, reg)); + free(reg); + } + free(cache->slots); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + if (!ncclParamLocalRegister()) + *handle = NULL; + else + NCCLCHECK(ncclRegister(comm, buff, size, false, handle)); + return ncclSuccess; +} + +ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + NCCLCHECK(ncclRegister(comm, buff, size, true, handle)); + return ncclSuccess; +} + +static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) { + NCCLCHECK(CommCheck(comm, "ncclCommRegister", 
"comm")); + struct ncclRegCache* cache = &comm->regCache; + int slot; + int saveDev; + if (reg == NULL) goto exit; + CUDACHECK(cudaGetDevice(&saveDev)); + CUDACHECK(cudaSetDevice(comm->cudaDev)); + for (slot = 0; slot < cache->population && cache->slots[slot] != reg; slot++); + if (slot == cache->population) { + WARN("Deregister: Could not find handle"); + return ncclInvalidUsage; + } + if (isGraph) --reg->graphRefs; + else --reg->localRefs; + if (reg->localRefs || reg->graphRefs) return ncclSuccess; + NCCLCHECK(regCleanup(comm, reg)); + free(reg); + memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*)); + cache->population -= 1; + CUDACHECK(cudaSetDevice(saveDev)); +exit: + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) { + NCCLCHECK(commDeregister(comm, false, (struct ncclReg*)handle)); + return ncclSuccess; +} + +ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) { + NCCLCHECK(commDeregister(comm, true, handle)); + return ncclSuccess; +} diff --git a/src/register/sendrecv_reg.cc b/src/register/sendrecv_reg.cc new file mode 100644 index 000000000..f82fbd714 --- /dev/null +++ b/src/register/sendrecv_reg.cc @@ -0,0 +1,35 @@ +#include "register.h" +#include "transport.h" + +ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue) { + ncclResult_t ret = ncclSuccess; + + *regFlag = 0; + if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL); + } + if (*regFlag == 0 && ncclParamLocalRegister()) { + ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle); + } + } + return ret; +} + +ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { + ncclResult_t ret = ncclSuccess; + uintptr_t offset = 0; + uintptr_t* peerRmtAddrs = NULL; + + *regFlag = 0; + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); + } + if (*regFlag == 0 && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); + } + + if (*regFlag) + *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); + return ret; +} diff --git a/src/transport.cc b/src/transport.cc index eeee7a24b..5629ce7a2 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -94,13 +94,13 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p } *intraNodeP2pSupport = supportFlag; *directMode = directFlag; + if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag); return ncclSuccess; } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { // Stream used during transport setup; need for P2P pre-connect + CUDA 
Graph ncclResult_t ret = ncclSuccess; - int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel @@ -131,7 +131,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); - if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail); + if (recvMask || sendMask) { + if (data[p] == NULL) NCCLCHECKGOTO(ncclCalloc(data + p, 2 * MAXCHANNELS), ret, fail); + else memset(data[p], 0, 2 * MAXCHANNELS * sizeof(struct ncclConnect)); + } recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; @@ -139,7 +142,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; c(comm, graph, recvData[p]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); - if (type > highestType) highestType = type; } } TIME_STOP(0); @@ -148,7 +150,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; c(comm, graph, sendData[p]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); - if (type > highestType) highestType = type; } } TIME_STOP(1); @@ -222,22 +223,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } TIME_STOP(4); } - if (sendMask || recvMask) { - free(data[p]); - data[p] = NULL; - } } - if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) { + if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) { struct timeval now; gettimeofday(&now, NULL); - if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) { - float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; - float remaining = elapsed*(comm->nRanks-done)/done; + if (((now.tv_sec - timeLast.tv_sec) * 1.0 + (now.tv_usec - timeLast.tv_usec) * 1e-6) > 1) { + float elapsed = (now.tv_sec - timeStart.tv_sec) * 1.0 + (now.tv_usec - timeStart.tv_usec) * 1e-6; + float remaining = elapsed * (comm->nRanks - done) / done; printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d ", - timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60); + timeReported ? "\r" : "", done * 100.0 / comm->nRanks, ((int)elapsed) / 60, ((int)elapsed) % 60, ((int)remaining) / 60, ((int)remaining) % 60); fflush(stdout); timeReported = true; - timeLast = now; // struct copy; + timeLast = now; // struct copy; } } } @@ -280,7 +277,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; } - if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: for(int i=0; irank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; @@ -177,10 +179,10 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); @@ -319,6 +321,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + /* collective size limits*/ + resources->maxCollBytes = props.maxCollBytes; + if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } return ncclSuccess; } @@ -430,6 +439,12 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->maxCollBytes = props.maxCollBytes; + if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("recvProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; @@ -645,14 +660,14 @@ static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int su return offset; } -static int calcRegionOffset( +static ssize_t calcRegionOffset( struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step, int side // 0=begin, 1=end ) { struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet; - int slotSize = collNet->buffSize/NCCL_STEPS; - int chunkSize = args->chunkSize; - int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS); + ssize_t slotSize = collNet->buffSize/NCCL_STEPS; + ssize_t chunkSize = args->chunkSize; + ssize_t base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS); base *= collNet->nChannels*slotSize; if (args->coll == ncclFuncAllReduce) { return base + (sub+side)*chunkSize; @@ -674,6 +689,165 @@ static constexpr int calcStepsPerGroup(int nGroups) { return NCCL_STEPS; } +static ncclResult_t collNetRegIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t *nBytesInOut, void **request) { + ssize_t loopSize, winOffset, nBytes; + ssize_t eltSize =
ncclTypeSize((ncclDataType_t)args->dtype); + // for UB iallreduce 1RPN case, user's send and recv buffers are both directly accessed by collnet network. + // we can just issue maximal collnet bytes by resources->maxCollBytes for each iallreduce. + // for multi-RPN case, we have to consider pipeline, so each time we only send groupSize * chunkSize (i.e., nBytesInOut) + // sub->loopOffset is data offset to the buffer for this head rank in each loop + // winOffset is used to find actual offset from send and recv buffer for this iallreduce + // loopSize is all bytes sent by all channels and head ranks in each loop. + // send and recv mem handle are retrieved from sub in which user buffer mem handles are stored. + if (sub->isOneRPN) { + winOffset = 0; + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + loopSize = nBytes; + } else { + winOffset = sub->loopOffset + groupStart * args->chunkSize; + nBytes = std::min(sub->nbytes - winOffset, *nBytesInOut); + loopSize = sub->loopSize; + } + + if (nBytes > 0) { + NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff + winOffset, sub->recvbuff + winOffset, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, request)); + if (*request) { + // if issued successfully, we need to move the pointer forward and reduce the existing nbytes. + sub->nbytes -= loopSize; + sub->sendbuff += loopSize; + sub->recvbuff += loopSize; + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] registered Iallreduce posted sendbuff %p recvbuff %p size %ld loopSize %ld winOffset %ld isOneRPN %d req %p", (long)sub->transmitted, sub->nsteps, groupStart, sub->sendbuff, sub->recvbuff, nBytes, loopSize, winOffset, sub->isOneRPN, *request); + } + } + *nBytesInOut = nBytes; + return ncclSuccess; +} + +static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t sendBeg, ssize_t recvBeg, void **request) { + void *sendMhandle = resources->sendMhandles[NCCL_PROTO_SIMPLE]; + void *recvMhandle = resources->recvMhandles[NCCL_PROTO_SIMPLE]; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); + // non-UB iallreduce, region is intermediate buffer and sendBeg/recvBeg is the corresponding offset + // for send and recv data. The send and recv mem handle are retrieved from resources. + NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallreduce posted size %ld sendBeg %ld recvBeg %ld req %p", (long)sub->transmitted, sub->nsteps, nBytes, sendBeg, recvBeg, *request); + return ncclSuccess; +} + +static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { + ncclNetSGE_v9_t recvParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t nBytes; + ssize_t winOffset; + void *sendbuff; + // UB iallgather 1RPN logic is the same as iallreduce. 
+ // If iallgather is not 1RPN, we can let the collnet network directly access sendbuff but not recvbuff; + // the main reason is that the non-1RPN case will cause non-contiguous recv data from the network, so + // we have to use intermediate buffer "region" to recv data and copy into the recvbuff. + // so allBeg and recvMhandle, which are the global window offset of the recv buffer and the mem handle for region, + // are only used in the multi-RPN case. + if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + winOffset = sub->offset; + recvParts.mhandle = sub->recvMhandle; + recvParts.address = sub->recvbuff; + } else { + nBytes = nBytesIn; + winOffset = allBeg; + recvParts.mhandle = recvMhandle; + recvParts.address = region + recvBeg; + } + recvParts.size = nBytes; + if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { + sendbuff = sub->sendbuff + winOffset % sizePerRank; + } else { + sendbuff = sub->sendbuff; + } + NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, sendbuff, 1, &recvParts, sizePerRank, winOffset, nBytes, sub->sendMhandle, request)); + if (*request) { + if (sub->isOneRPN) { + sub->recvbuff += nBytes; + sub->nbytes -= nBytes; + sub->offset += nBytes; + } + TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld isOneRPN %d request %p", sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request); + } + return ncclSuccess; +} + +static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { + ncclNetSGE_v9_t recvParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + recvParts.mhandle = recvMhandle; + recvParts.address = region + recvBeg; + recvParts.size = nBytes; + // non-UB iallgather, we use intermediate region buffers for both send and recv data. + // sendMhandle and recvMhandle are send and recv mem handles for region, and allBeg is + // the global window offset of the recv buffer. sendBeg and recvBeg are offsets into the region + // for intermediate data. + NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, region + sendBeg, 1, &recvParts, sizePerRank, allBeg, nBytes, sendMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld request %p", sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request); + return ncclSuccess; +} + +static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { + ncclNetSGE_v9_t sendParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t nBytes; + size_t winOffset; + void *recvbuff; + // Similar to iallgather, if ireducescatter is not 1RPN, we can let the collnet network + // directly access recvbuff but not sendbuff. We use intermediate buffer "region" to + // send data and directly recv into the recvbuff.
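+ // (For example, in the 1RPN case the user sendbuff is posted directly in chunks of up to + // resources->maxCollBytes and sub->sendbuff/sub->nbytes/sub->offset advance after each successful post; + // in the multi-RPN case only the nBytesIn bytes staged in region at sendBeg are posted per call.)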
+ if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + winOffset = sub->offset; + sendParts.mhandle = sub->sendMhandle; + sendParts.address = sub->sendbuff; + } else { + nBytes = nBytesIn; + winOffset = allBeg; + sendParts.mhandle = sendMhandle; + sendParts.address = region + sendBeg; + } + sendParts.size = nBytes; + if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { + recvbuff = sub->recvbuff + winOffset % sizePerRank; + } else { + recvbuff = sub->recvbuff; + } + NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, recvbuff, sizePerRank, winOffset, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->recvMhandle, request)); + if (*request) { + if (sub->isOneRPN) { + sub->sendbuff += nBytes; + sub->nbytes -= nBytes; + sub->offset += nBytes; + } + TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld isOneRPN %d request %p", sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request); + } + return ncclSuccess; +} + +static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { + ncclNetSGE_v9_t sendParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + sendParts.mhandle = sendMhandle; + sendParts.address = region + sendBeg; + sendParts.size = nBytes; + // non-UB ireducescatter is the same as non-UB iallgather but in the reverse direction. 
+ NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, region + recvBeg, sizePerRank, allBeg, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, recvMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld request %p", sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request); + return ncclSuccess; +} + static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { @@ -683,6 +857,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; + //adjust nsteps for registerd buffers as device signals a single step + if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes); } args->state = ncclProxyOpProgress; } @@ -695,28 +871,30 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; - char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); auto reqFifo = resources->reqFifo; int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - if (sub->reg == 0) { + if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncReduceScatter)) { resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0); __sync_synchronize(); } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS)); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, sub->nsteps, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS)); sub->posted += args->sliceSteps; - *sendHead = sub->base + sub->posted - NCCL_STEPS; + // Only post one credit for registered buffer + if (sub->reg == 0 || !sub->isOneRPN || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; - if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) { + //device progresses tail by only 1 for registered buffers + uint64_t tail = sub->base + (sub->reg && sub->isOneRPN ? 
0 : sub->received); + if ((connFifo[buffSlot].size != -1 || sub->reg) && (*recvTail > tail)) { if (args->coll != ncclFuncAllReduce && sub->reg == 0) { int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0); int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1); @@ -738,110 +916,42 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue; - ssize_t sizePerRank = 0; - size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted); - size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted); - int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0); - int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1); - int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0); - int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1); + ssize_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted); + ssize_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted); + ssize_t sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0); + ssize_t sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1); + ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0); + ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1); reqFifo[group][buffSlot].size = recvEnd - recvBeg; - size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); - if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) { + if (sendBeg==sendEnd && recvBeg==recvEnd) { sub->requests[buffSlot] = nullptr; // trivally finished request } else { + ssize_t nBytes = 0; if (args->coll == ncclFuncAllReduce) { + nBytes = sendEnd - sendBeg; if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - int count = (int)(nBytes / eltSize); - NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->nbytes -= nBytes; - sub->sendbuff += nBytes; - sub->recvbuff += nBytes; - } + NCCLCHECK(collNetRegIallreduce(proxyState, resources, args, sub, groupStart, &nBytes, &sub->requests[buffSlot])); } else { - int count = (sendEnd - sendBeg) / eltSize; - NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot)); + NCCLCHECK(collNetIallreduce(proxyState, resources, args, sub, nBytes, sendBeg, recvBeg, &sub->requests[buffSlot])); } - } else { - sizePerRank = args->specifics.collnetDirect.sizePerRank; - if (args->coll == ncclFuncAllGather) { - ncclNetSGE_v8_t recvParts; - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - void *sendbuff; - recvParts.mhandle = sub->recvMhandle; - recvParts.address = sub->recvbuff; - recvParts.size = nBytes; - if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { - sendbuff = sub->sendbuff + sub->offset % sizePerRank; - } else { - sendbuff = sub->sendbuff; - } - NCCLCHECK(proxyState->ncclCollNet->iallgather( - resources->collNetComm, sendbuff, 1, &recvParts, - sizePerRank, sub->offset, nBytes, - sub->sendMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->recvbuff += nBytes; - sub->nbytes -= nBytes; - sub->offset += 
nBytes; - } - } else { - recvParts.mhandle = recvMhandle; - recvParts.address = region + recvBeg; - recvParts.size = allEnd - allBeg; - NCCLCHECK(proxyState->ncclCollNet->iallgather( - resources->collNetComm, region + sendBeg, 1, &recvParts, - sizePerRank, allBeg, allEnd - allBeg, - sendMhandle, sub->requests + buffSlot)); - } + } else if (args->coll == ncclFuncAllGather) { + nBytes = allEnd - allBeg; + if (sub->reg) { + NCCLCHECK(collNetRegIallgather(proxyState, resources, args, sub, nBytes, allBeg, recvBeg, recvMhandle, &sub->requests[buffSlot])); } else { - ncclNetSGE_v8_t sendParts; - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - void *recvbuff; - sendParts.mhandle = sub->sendMhandle; - sendParts.address = sub->sendbuff; - sendParts.size = nBytes; - if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { - recvbuff = sub->recvbuff + sub->offset % sizePerRank; - } else { - recvbuff = sub->recvbuff; - } - NCCLCHECK(proxyState->ncclCollNet->ireducescatter( - resources->collNetComm, 1, &sendParts, recvbuff, - sizePerRank, sub->offset, nBytes, - (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, - sub->recvMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->sendbuff += nBytes; - sub->nbytes -= nBytes; - sub->offset += nBytes; - } - } else { - sendParts.mhandle = sendMhandle; - sendParts.address = region + sendBeg; - sendParts.size = allEnd - allBeg; - NCCLCHECK(proxyState->ncclCollNet->ireducescatter( - resources->collNetComm, 1, &sendParts, region + recvBeg, - sizePerRank, allBeg, allEnd - allBeg, - (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, - recvMhandle, sub->requests + buffSlot)); - } + NCCLCHECK(collNetIallgather(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot])); } - } - if (sub->requests[buffSlot] == nullptr) continue; - - if (args->coll == ncclFuncAllReduce) { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]); - } else if (args->coll == ncclFuncAllGather) { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]); } else { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]); + // reducescatter + nBytes = allEnd - allBeg; + if (sub->reg) { + NCCLCHECK(collNetRegIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, sendMhandle, &sub->requests[buffSlot])); + } else { + NCCLCHECK(collNetIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot])); + } } + if (nBytes > 0 && sub->requests[buffSlot] == nullptr) continue; } } sub->transmitted += args->sliceSteps; @@ -875,6 +985,52 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct return ncclSuccess; } +static ncclResult_t collNetRecvFlush(struct ncclProxyState* proxyState, struct recvResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t nBytesIn, ssize_t recvBeg, void **request) { + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, 
gpu, buffs[NCCL_PROTO_SIMPLE]); + if (sub->reg && (sub->isOneRPN || args->coll != ncclFuncAllGather)) { + ssize_t nBytes, loopSize; + ssize_t offset = sub->offset + groupStart * args->chunkSize; + if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + loopSize = nBytes; + } else { + nBytes = std::min(sub->nbytes - sub->loopOffset, nBytesIn); + loopSize = sub->loopSize; + } + if (nBytes > 0) { + if (args->coll == ncclFuncReduceScatter) { + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + ssize_t groupStartOffset = sub->offset + groupStart * args->chunkSize; + ssize_t groupEndOffset = groupStartOffset + nBytes; + int node = args->specifics.collnetDirect.node; + int startNode = groupStartOffset / sizePerRank; + int lastNode = groupEndOffset / sizePerRank; + if (startNode == node) { + offset = groupStartOffset % sizePerRank; + nBytes = std::min(sizePerRank - offset, nBytes); + } else if (startNode < node && node < lastNode) { + offset = 0; + nBytes = sizePerRank; + } else if (node == lastNode) { + offset = 0; + nBytes = groupEndOffset % sizePerRank; + } else { + // dummy flush + offset = 0; + } + } + NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset + sub->loopOffset, nBytes, sub->recvMhandle, request)); + if (*request) { + sub->nbytes -= loopSize; + sub->offset += loopSize; + } + } + } else { + NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region + recvBeg, nBytesIn, resources->mhandles[NCCL_PROTO_SIMPLE], request)); + } + return ncclSuccess; +} + static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { @@ -884,22 +1040,21 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; + //adjust nsteps for registerd buffers as device signals a single step + if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes); memset(sub->requests, 0, sizeof(sub->requests)); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); for (int s=0; snsubs; s++) { int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); - void* mhandle = resources->mhandles[p]; auto reqFifo = resources->reqFifo; - char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Enforce sync between operations of the same group. 
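// --- Illustrative sketch (editor addition, not part of the patch) ---
// Standalone version of the per-node window arithmetic that collNetRecvFlush
// above applies when flushing a registered ReduceScatter buffer: a global byte
// window [winBeg, winBeg+nBytes) is intersected with this node's slice of
// sizePerRank bytes. Struct and function names here are hypothetical.
#include <algorithm>
#include <cstdio>

struct FlushWindow { long offset; long nBytes; };   // offset into this node's slice

static FlushWindow flushWindowForNode(long winBeg, long nBytes, long sizePerRank, int node) {
  long winEnd = winBeg + nBytes;
  int startNode = (int)(winBeg / sizePerRank);
  int lastNode  = (int)(winEnd / sizePerRank);
  if (startNode == node) {                           // window starts inside our slice
    long off = winBeg % sizePerRank;
    return { off, std::min(sizePerRank - off, nBytes) };
  } else if (startNode < node && node < lastNode) {  // our slice is fully covered
    return { 0, sizePerRank };
  } else if (node == lastNode) {                     // window ends inside our slice
    return { 0, winEnd % sizePerRank };
  }
  return { 0, 0 };                                   // nothing of this window lands here
}

int main() {
  // e.g. 1 MiB per rank, flushing the second MiB of the collective on node 1
  FlushWindow w = flushWindowForNode(1L << 20, 1L << 20, 1L << 20, 1);
  printf("offset %ld nBytes %ld\n", w.offset, w.nBytes);
  return 0;
}
// --- end of sketch ---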
if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) { @@ -913,10 +1068,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete - int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0); - int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1); - int totalSize = recvEnd - recvBeg; - TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize); + ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0); + ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->received, 1); + ssize_t totalSize = recvEnd - recvBeg; + TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %ld chunkSize=%ld", (long)sub->received, group, buffSlot, totalSize, args->chunkSize); sub->received += args->sliceSteps; if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) { // GDRCOPY support @@ -929,37 +1084,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct return ncclInternalError; #endif } else { - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - size_t offset = 0; - if (args->coll == ncclFuncReduceScatter) { - size_t sizePerRank = args->specifics.collnetDirect.sizePerRank; - int node = args->specifics.collnetDirect.node; - int startNode = sub->offset / sizePerRank; - int lastNode = (sub->offset + nBytes) / sizePerRank; - if (startNode == node) { - offset = sub->offset % sizePerRank; - nBytes = std::min(sizePerRank - offset, nBytes); - } else if (startNode < node && node < lastNode) { - nBytes = sizePerRank; - } else if (node == lastNode) { - nBytes = (sub->offset + nBytes) % sizePerRank; - } else { - // no need to flush - nBytes = 0; - } - } - NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot)); - if (sub->requests[buffSlot]) { - sub->nbytes -= nBytes; - sub->offset += nBytes; - if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) { - sub->recvbuff += nBytes; - } - } - } else { - NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot)); - } + NCCLCHECK(collNetRecvFlush(proxyState, resources, args, sub, groupStart, totalSize, recvBeg, &sub->requests[buffSlot])); } } args->idle = 0; @@ -980,14 +1105,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } } if (sub->transmitted < sub->flushed) { - if (sub->reg == 0) { + if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncAllGather)) { int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0); __sync_synchronize(); } volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; - *recvTail = sub->base + sub->flushed; + if (sub->reg && sub->isOneRPN) { + // We may have bumped net steps, but reg operations only have a single step w.r.t. the GPU. 
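// --- Illustrative sketch (editor addition, not part of the patch) ---
// Step accounting for registered, one-rank-per-node buffers as adjusted above:
// the proxy may need several network transfers of at most maxCollBytes each,
// but the device posted a single step, so the GPU-visible tail is advanced only
// once, after the last network step has flushed. Helper names are hypothetical.
#include <cstddef>

static inline size_t divup(size_t a, size_t b) { return (a + b - 1) / b; }

// Number of network steps a registered buffer expands into.
static size_t netStepsForRegBuffer(size_t nbytes, size_t maxCollBytes) {
  return divup(nbytes, maxCollBytes);
}

// Whether the GPU-visible tail may be advanced after 'flushed' network steps.
static bool mayBumpGpuTail(bool reg, bool oneRankPerNode, size_t flushed, size_t netSteps) {
  if (!(reg && oneRankPerNode)) return true;  // non-registered path signals every step
  return flushed == netSteps;                 // registered path signals once, at the end
}
// --- end of sketch ---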
+ if (sub->flushed == sub->nsteps) *recvTail = sub->base + args->sliceSteps; + } else { + *recvTail = sub->base + sub->flushed; + } if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; @@ -999,7 +1129,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done : (sub-1)->done > sub->done; volatile uint64_t* sendHead = &resources->sendMem->head; - if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) { + int done = sub->reg && sub->isOneRPN ? 0 : sub->done; + if (groupSync && sub->done < sub->transmitted && sub->base + done < *sendHead) { sub->done += args->sliceSteps; args->idle = 0; if (sub->done == sub->nsteps && s == args->nsubs-1) { @@ -1017,24 +1148,22 @@ struct collnetRegInfo { size_t size; }; -ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { +static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; - struct ncclReg *regRecord = NULL; + if (regRecord) { + if (regRecord->state & COLLNET_REG_COMPLETE) { + // reuse previous registration + *outRegBufFlag = 2; + *outHandle = regRecord->collnetHandle; + INFO(NCCL_REG, "rank %d - COLLNET reuse register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, regRecord->collnetHandle, buffSize, type == collNetRecv ? "Recv" : "Send"); + goto exit; + } else { + /* start register collnet buffer */ + struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + void* handle = NULL; + struct ncclConnInfo* conn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn; - *outRegBufFlag = 0; - *outHandle = NULL; - if (comm && userbuff && buffSize > 0) { - NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); - if (regRecord) { - if (regRecord->state & COLLNET_REG_COMPLETE) { - // reuse previous registration - *outRegBufFlag = 2; - *outHandle = regRecord->collnetHandle; - goto exit; - } else { - /* start register collnet buffer */ - struct collnetRegInfo info = {regRecord->addr, regRecord->pages * comm->regCache.pageSize}; - void* handle = NULL; + if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { @@ -1042,11 +1171,13 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u regRecord->collnetProxyconn = proxyconn; *outHandle = regRecord->collnetHandle = handle; *outRegBufFlag = 1; + INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } + } else { + WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? 
"Recv" : "Send"); } } } - exit: return ret; fail: @@ -1055,44 +1186,63 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u goto exit; } +ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *outRegBufFlag = 0; + *outHandle = NULL; + if (comm && userbuff && buffSize > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + } +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + struct ncclCollnetCleanupCallback { struct ncclCommCallback base; - struct ncclProxyConnector* proxyConn; - void* buffer; - size_t size; - void* mhandle; + struct ncclComm *comm; + struct ncclReg *reg; }; static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; - NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle)); - INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer); + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; - void* handle = NULL; - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)userbuff & -pageSize; - size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize; - collnetRegInfo info = {addr, size}; struct ncclCollnetCleanupCallback* record = NULL; - struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + struct ncclReg *regRecord = NULL; + void *baseSend = NULL; + size_t baseSendSize = 0; *outRegBufFlag = 0; - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); - record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); - record->base.fn = cleanupCollnet; - record->proxyConn = proxyConn; - record->buffer = (void*)userbuff; - record->size = buffSize; - *outHandle = record->mhandle = handle; - *outRegBufFlag = 1; - ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); - *nCleanupQueueElts += 1; + if (comm && userbuff && buffSize > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + + if (*outRegBufFlag) { + record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); + record->base.fn = cleanupCollnet; + record->comm = comm; + record->reg = regRecord; + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + *nCleanupQueueElts += 1; + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); + } + } exit: return ret; @@ -1104,6 +1254,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) { NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); + INFO(NCCL_REG, "rank %d - COLLNET deregistered buffer handle %p", comm->rank, handle); return ncclSuccess; } @@ -1111,26 +1262,67 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct sendResources* resources = (struct sendResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); - if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +#endif +peermem: + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; +fail: + handle = NULL; + goto exit; } static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) 
{ void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct recvResources* resources = (struct recvResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); - if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; + #if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +#endif +peermem: + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; +fail: + handle = NULL; + goto exit; } static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { @@ -1155,13 +1347,6 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, return ncclSuccess; } -struct ncclTransport collNetTransport = { - "COL", - canConnect, - { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, - { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; - ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; char line[1024]; @@ -1197,7 +1382,6 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; - int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED; if (comm->collNetSupport == 0) goto exit; @@ -1206,13 +1390,13 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { struct ncclChannel* channelRecv = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelSend = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1), ret, fail); INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); @@ -1410,3 +1594,10 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop 
comm->collNetSupport = 0; goto exit; } + +struct ncclTransport collNetTransport = { + "COL", + canConnect, + { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, + { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } +}; \ No newline at end of file diff --git a/src/transport/generic.cc b/src/transport/generic.cc index 7fd7e59fb..47b023667 100644 --- a/src/transport/generic.cc +++ b/src/transport/generic.cc @@ -1,17 +1,37 @@ #include "comm.h" #include "transport.h" +#include "bootstrap.h" ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { + struct ringConnInfo { + bool useNetPXN; + bool useGdr; + }; + struct ringConnInfo* ringInfo = NULL; ncclResult_t ret = ncclSuccess; if (comm && comm->nRanks > 1) { + comm->useGdr = true; + comm->useNetPXN = false; for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); - INFO(NCCL_INIT, "Connected all rings"); + if (ncclParamLocalRegister() || ncclParamGraphRegister()) { + NCCLCHECK(ncclCalloc(&ringInfo, comm->nRanks)); + ringInfo[comm->rank].useGdr = comm->useGdr; + ringInfo[comm->rank].useNetPXN = comm->useNetPXN; + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail); + for (int i = 0; i < comm->nRanks; ++i) { + if (!ringInfo[i].useGdr) comm->useGdr = false; + if (ringInfo[i].useNetPXN) comm->useNetPXN = true; + if (comm->useGdr == false && comm->useNetPXN == true) break; + } + } + INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr); } exit: + free(ringInfo); return ret; fail: goto exit; diff --git a/src/transport/net.cc b/src/transport/net.cc index 00eca607d..8760b4258 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -15,6 +15,7 @@ #include "profiler.h" #include "transport.h" #include "shm.h" +#include static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -107,6 +108,7 @@ struct sendNetResources { int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; + size_t maxP2pBytes; }; struct recvNetResources { @@ -139,6 +141,12 @@ struct recvNetResources { int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; + size_t maxP2pBytes; +}; + +struct netRegInfo { + uintptr_t buffer; + size_t size; }; /* Determine if two peers can communicate with NET */ @@ -166,6 +174,9 @@ struct setupReq { int connIndex; }; +NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1); + +static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag"); // Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); @@ -181,8 +192,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); + 
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; + if (!req.useGdr && connIndex == 0) comm->useGdr = 0; + if (proxyRank != myInfo->rank && connIndex == 0) comm->useNetPXN = true; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; @@ -198,6 +211,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; + memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); return ncclSuccess; } @@ -218,10 +232,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); + recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; + if (!req.useGdr && connIndex == 0) comm->useGdr = 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); // We don't support PXN on receive yet NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); @@ -230,6 +246,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); + memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; @@ -283,8 +300,11 @@ struct netRecvConnectArgs { static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct connectMap* map = (connectMap*) send->transportResources; - void* opId; + int recvUseGdr; + + memcpy(&recvUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int)); + if (!recvUseGdr) send->conn.flags &= ~NCCL_DIRECT_NIC; // map isn't allocated thus this op hasn't been submitted yet if (!map) { @@ -391,6 +411,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct connectMap* map = (connectMap*) recv->transportResources; void* opId; + int sendUseGdr; + + memcpy(&sendUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int)); + if (!sendUseGdr) recv->conn.flags &= ~NCCL_DIRECT_NIC; + if (!map) { NCCLCHECK(ncclCalloc(&map, 1)); recv->transportResources = map; @@ -522,7 +547,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int return ncclSuccess; } -static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) { +static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, size_t* size) { // Use different pools for different channels and also separate send/recv. int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; *offset = proxyState->p2pChunkSize * globalSlot; @@ -590,6 +615,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; + /* point-to-point size limits*/ + resources->maxP2pBytes = props.maxP2pBytes; + if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("sendProxySetup: net plugin returned invalid value for maxP2pBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } // We don't return any data if (respSize != 0) return ncclInternalError; @@ -621,6 +653,13 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; + /* point-to-point size limits*/ + resources->maxP2pBytes = props.maxP2pBytes; + if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("recvProxySetup: net plugin returned invalid value for maxP2pBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); @@ -916,6 +955,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + for (int i = 0; i < NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1; for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { @@ -1032,7 +1072,6 @@ static 
ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { @@ -1045,11 +1084,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; ncclProfilerStartSendProxyOpEvent(s, args); - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); - } else { - sub->mhandle = resources->mhandles[args->protocol]; - } + if (!sub->reg) + sub->sendMhandle = resources->mhandles[args->protocol]; } args->state = ncclProxyOpProgress; } @@ -1059,6 +1095,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; + int postedStepId = sub->posted; + int transmittedStepId = sub->transmitted; + int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; @@ -1066,7 +1105,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Post buffers to the GPU if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { - ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps); + ncclProfilerStartSendProxyStepEvent(s, args, postedStepId); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { if (!sub->reg) { @@ -1078,12 +1117,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; - // Only post one credit for registered buffer - if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; + *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write - } else sub->posted += args->sliceSteps; + } else { + sub->posted += args->sliceSteps; + } ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted); - ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait); + ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait); args->idle = 0; continue; } @@ -1091,10 +1131,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile uint64_t* recvTail = &resources->recvMem->tail; - uint64_t tail = sub->base + (sub->reg ? 
0 : sub->transmitted); - if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) { + uint64_t tail = sub->base + sub->transmitted; + if (connFifo[buffSlot].size != -1 && (*recvTail > tail || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. - int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size; + int size = connFifo[buffSlot].size; bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize; int ready = 1; @@ -1120,22 +1160,28 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct volatile uint32_t *f2 = &lines[i].flag2; if (f1[0] != flag || f2[0] != flag) { ready = 0; break; } } - } else if (p == NCCL_PROTO_SIMPLE && resources->shared) { - buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset; + } else if (p == NCCL_PROTO_SIMPLE) { + if (resources->shared) { + buff = sub->reg ? (char*)sub->sendbuff + sub->transmitted * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset; + } else if (sub->reg) { + size_t sendSize; + sub->ringAlgo->getNextSendAddr(sub->transmitted, (uint8_t**)&buff, &sendSize, &sub->sendMhandle); + assert(sendSize == size); + } } if (ready) { - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); + ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); // Data is ready, try to send. // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. // coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); + sub->transSize += size; sub->transmitted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); - ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait); - sub->transSize += size; + ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; continue; } @@ -1149,41 +1195,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size)); if (done) { - if (sub->reg) { - if (size < sub->nbytes) { - sub->recvbuff += size; - sub->nbytes -= size; - // Do one more step (at least) - sub->nsteps++; - } else { - // Signal the GPU the send is complete and it can return. 
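// --- Illustrative sketch (editor addition, not part of the patch) ---
// How the proxies step through a registered user buffer in fixed-size chunks:
// the send path above and the receive path below place step i at
// i * NCCL_MAX_NET_SIZE into the buffer, clamping the final chunk to what is
// left. stepChunk and maxNetSize are hypothetical stand-ins.
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void stepChunk(uint8_t* userBuff, size_t nbytes, size_t maxNetSize,
                      uint64_t step, uint8_t** ptr, size_t* size) {
  size_t off = (size_t)step * maxNetSize;      // caller only posts steps with off < nbytes
  *ptr  = userBuff + off;                      // advance into the user buffer
  *size = std::min(maxNetSize, nbytes - off);  // last chunk may be short
}
// --- end of sketch ---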
- connFifo[sub->base%NCCL_STEPS].size = -1; - } - } // Make sure size is reset to -1 before we update the head. - if (sub->reg == 0) connFifo[buffSlot].size = -1; + connFifo[buffSlot].size = -1; __sync_synchronize(); - TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]); sub->done += args->sliceSteps; - ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerStopProxyStepEvent(s, args, doneStepId); ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; - if (sub->reg) { - // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. - if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps; - } else { - *sendHead = sub->base + sub->done; - } + *sendHead = sub->base + sub->done; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle)); - } args->done++; + if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo; + sub->ringAlgo = NULL; } } } @@ -1232,14 +1261,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->received = sub->transmitted = sub->done = 0; + sub->regBufferReady = 0; for (int i=0; ireg && sub->nbytes > 0) { - // Register buffer - NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); - } else { - sub->mhandle = resources->mhandles[args->protocol]; - } + if (!sub->reg) + sub->recvMhandle = resources->mhandles[args->protocol]; } args->state = ncclProxyOpProgress; } @@ -1251,32 +1277,44 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; int subCount = 0; void* ptrs[NCCL_PROXY_MAX_SUBS]; - int sizes[NCCL_PROXY_MAX_SUBS]; + size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int postedStepId = sub->posted; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } - ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps); + ncclProfilerStartRecvProxyStepEvent(s+i, args, postedStepId); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - if (sub->reg) maxDepth = 1; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; - if (p == NCCL_PROTO_SIMPLE && resources->shared) { - if (sub->reg) { - // Wait until CUDA kernel has started before we access the user buffer directly. 
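// --- Illustrative sketch (editor addition, not part of the patch) ---
// Ordering used above when a send completes: the connFifo slot is marked free
// (size = -1) and a full barrier is issued before the head counter polled by
// the GPU is advanced, so the consumer never sees the new head with stale slot
// state. C++ atomics stand in for the volatile accesses and __sync_synchronize()
// of the real code; all names here are hypothetical.
#include <atomic>
#include <cstdint>

struct SlotFifo {
  std::atomic<int> size[8];      // -1 means "slot free"
  std::atomic<uint64_t> head;    // consumer polls this
};

static void completeStep(SlotFifo& fifo, int slot, uint64_t base, uint64_t done) {
  fifo.size[slot].store(-1, std::memory_order_relaxed);  // release the slot
  std::atomic_thread_fence(std::memory_order_release);   // slot release visible before head
  fifo.head.store(base + done, std::memory_order_relaxed);
}
// --- end of sketch ---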
- if (connFifo[sub->base%NCCL_STEPS].size == -1) continue; - ptrs[subCount] = sub->recvbuff; - sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes); + if (p == NCCL_PROTO_SIMPLE) { + if (resources->shared) { + if (sub->reg) { + // Wait until CUDA kernel has started before we access the user buffer directly. + if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue; + sub->regBufferReady = 1; + ptrs[subCount] = sub->recvbuff + sub->posted * NCCL_MAX_NET_SIZE; + sizes[subCount] = std::min(NCCL_MAX_NET_SIZE, (ssize_t)(sub->nbytes - sub->posted * NCCL_MAX_NET_SIZE)); + } else { + int sharedBuffSlot = sub->posted % maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot * args->nsubs + s + i, &offset, sizes + subCount)); + connFifo[buffSlot].offset = offset; + ptrs[subCount] = localBuff + offset; + } } else { - int sharedBuffSlot = sub->posted%maxDepth; - int offset; - NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount)); - connFifo[buffSlot].offset = offset; - ptrs[subCount] = localBuff+offset; + if (sub->reg) { + if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue; + sub->regBufferReady = 1; + sub->ringAlgo->getNextRecvAddr(sub->posted, (uint8_t**)&ptrs[subCount], &sizes[subCount], &sub->recvMhandle); + } else { + ptrs[subCount] = localBuff + buffSlot * stepSize; + sizes[subCount] = stepSize * args->sliceSteps; + } } } else { ptrs[subCount] = localBuff+buffSlot*stepSize; @@ -1284,7 +1322,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; - mhandles[subCount] = sub->mhandle; + mhandles[subCount] = sub->recvMhandle; subCount++; } } @@ -1292,15 +1330,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct uint64_t step = subGroup->posted; struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); + bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); + if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; + int postedStepId = sub->posted; + TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait); + ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } args->idle = 0; } @@ -1321,31 +1363,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (done) { int needFlush = 0; int totalSize = 0; - int subIndex = 0; 
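// --- Illustrative sketch (editor addition, not part of the patch) ---
// The "optional receive completion" gate used above: LL and LL128 embed validity
// flags in the payload, so the receiver can detect arrival without a
// network-level completion, and the proxy may tell the plugin to skip generating
// one. The enum and helper below are illustration-only names.
enum Proto { PROTO_LL, PROTO_LL128, PROTO_SIMPLE };

static bool maySkipRecvCompletion(bool optionalCompletionEnabled, Proto proto, int recvCount) {
  // Safe only when the payload carries its own flags and the request covers a
  // single receive.
  return optionalCompletionEnabled && (proto == PROTO_LL || proto == PROTO_LL128) && recvCount == 1;
}
// --- end of sketch ---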
for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; - if (sub->received < sub->nsteps) { - int size = sizes[subIndex++]; - if (sub->reg) { - if (size < sub->nbytes) { - sub->recvbuff += size; - sub->nbytes -= size; - // Do one more step (at least) - sub->nsteps++; - } else { - // Reset connFifo size indicating the GPU was ready to receive. - // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU. - struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; - connFifo[sub->base%NCCL_STEPS].size = -1; - } - } - } - sub->received += args->sliceSteps; + int receivedStepId = sub->received; + int buffSlot = (sub->base + sub->received) % NCCL_STEPS; + struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources); + volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; + connFifo[buffSlot].size = -1; sub->transSize += sizes[i]; + sub->received += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait); + ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, ncclProfilerProxyStepRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; @@ -1372,10 +1401,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS; - ptrs[subCount] = resources->shared ? - (sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) : - localBuff+buffSlot*stepSize; - mhandles[subCount] = sub->mhandle; + if (resources->shared) { + ptrs[subCount] = sub->reg ? (char*)sub->recvbuff + step * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset; + } else { + if (sub->reg) { + sub->ringAlgo->getNextRecvAddr(step, (uint8_t**)&ptrs[subCount], NULL, &sub->recvMhandle); + } else { + ptrs[subCount] = localBuff + buffSlot * stepSize; + } + } + mhandles[subCount] = sub->recvMhandle; subCount++; } } @@ -1399,19 +1434,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int transmittedStepId = sub->transmitted; sub->transmitted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait); + ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; - if (sub->reg) { - // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. - if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps; - } else - *recvTail = sub->base + sub->transmitted; + *recvTail = sub->base + sub->transmitted; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } } @@ -1425,11 +1457,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead; + uint64_t done = *sendHead; while (done > sub->base + sub->done && // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. sub->transmitted > sub->done) { @@ -1440,15 +1473,13 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } sub->done += args->sliceSteps; - ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerStopProxyStepEvent(s+i, args, doneStepId); ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone); args->idle = 0; if (sub->done == sub->nsteps) { - struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle)); - } args->done++; + if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo; + sub->ringAlgo = NULL; break; } } @@ -1465,9 +1496,228 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct return ncclSuccess; } +ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle) { + NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); + INFO(NCCL_REG, "rank %d - deregistered net buffer handle %p", comm->rank, handle); + return ncclSuccess; +} + +static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + int gdrFlag = 1; + + if (regRecord) { + for (int p = 0; p < nPeers; ++p) { + struct ncclConnector* peerConn = peerConns[p]; + struct ncclProxyConnector* peerProxyConn = NULL; + struct ncclRegNetHandles* netHandle = NULL; + bool found = false; + if (peerConn == NULL) continue; + peerProxyConn = &peerConn->proxyConn; + netHandle = regRecord->netHandleHead; + while (netHandle) { + if (netHandle->proxyConn == peerProxyConn) { + found = true; + break; + } + netHandle = netHandle->next; + } + if (found) { + *outRegBufFlag = 1; + outHandle[p] = netHandle->handle; + INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle); + } else { + struct netRegInfo info = { 
regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + void* handle = NULL; + + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, peerProxyConn, ncclProxyMsgRegister, &info, sizeof(struct netRegInfo), &handle, sizeof(void*)), ret, fail); + if (handle) { + struct ncclRegNetHandles* netHandle; + regRecord->state |= NET_REG_COMPLETE; + NCCLCHECK(ncclCalloc(&netHandle, 1)); + netHandle->handle = handle; + netHandle->proxyConn = peerProxyConn; + netHandle->next = regRecord->netHandleHead; + regRecord->netHandleHead = netHandle; + outHandle[p] = handle; + *outRegBufFlag = 1; + INFO(NCCL_REG, "rank %d - NET register userbuff %p (handle %p), buffSize %ld", comm->rank, userbuff, handle, buffSize); + } else { + goto fail; + } + } else { + gdrFlag = 0; + goto fail; + } + } + } + } + +exit: + return ret; +fail: + *outRegBufFlag = 0; + WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + goto exit; +} + +ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *outRegBufFlag = 0; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + } + +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + +struct ncclNetCleanupCallback { + struct ncclCommCallback base; + struct ncclComm *comm; + struct ncclReg *reg; +}; + +static ncclResult_t cleanupNet(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclNetCleanupCallback* obj = (struct ncclNetCleanupCallback*)cb; + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { + ncclResult_t ret = ncclSuccess; + struct ncclNetCleanupCallback *record = NULL; + struct ncclReg *regRecord = NULL; + void *baseSend; + size_t baseSendSize; + + *outRegBufFlag = 0; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + if (*outRegBufFlag) { + NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); + record->base.fn = cleanupNet; + record->comm = comm; + record->reg = regRecord; + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + if (nCleanupQueueElts) *nCleanupQueueElts += 1; + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); + } + } +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + +static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, 
int* done) { + void* handle; + struct netRegInfo* info = (struct netRegInfo*)reqBuff; + struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; + + assert(reqSize == sizeof(struct netRegInfo)); + assert(respSize == sizeof(void*)); + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +peermem: +#endif + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: + memcpy(respBuff, (void*)&handle, sizeof(void*)); + *done = 1; + return ncclSuccess; +fail: + handle = NULL; + goto exit; +} + +static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + void* handle; + struct netRegInfo* info = (struct netRegInfo*)reqBuff; + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; + + assert(reqSize == sizeof(struct netRegInfo)); + assert(respSize == sizeof(void*)); + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +peermem: +#endif + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: + memcpy(respBuff, (void*)&handle, sizeof(void*)); + *done = 1; + return ncclSuccess; +fail: + handle = NULL; + goto exit; +} + +static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { + void* handle; + struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + + assert(reqSize == sizeof(void*)); + memcpy(&handle, reqBuff, sizeof(void*)); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, handle)); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { + void* handle; + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); + + assert(reqSize == sizeof(void*)); + memcpy(&handle, reqBuff, sizeof(void*)); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, handle)); + *done = 1; + return ncclSuccess; +} + struct ncclTransport netTransport = { "NET", canConnect, - { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL }, - { recvSetup, recvConnect, 
recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL } + { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, + { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } }; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d828c9801..bc54133d3 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -42,14 +42,12 @@ struct ncclIbMrCache { }; static int ncclNMergedIbDevs = -1; -#define NCCL_IB_MAX_DEVS_PER_NIC 2 +#define NCCL_IB_MAX_DEVS_PER_NIC 4 #define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC struct alignas(64) ncclIbMergedDev { - int ndevs; - int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs + ncclNetVDeviceProps_t vProps; int speed; char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' - int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no }; struct ncclIbStats { @@ -69,16 +67,20 @@ struct alignas(64) ncclIbDev { ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; + char* virtualPciPath; int realPort; int maxQp; + float latency; struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING struct ibv_port_attr portAttr; struct ncclIbStats stats; + int dmaBufSupported; }; -#define MAX_IB_DEVS 32 -struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS]; +#define MAX_IB_DEVS 32 +#define MAX_IB_VDEVS MAX_IB_DEVS*8 +struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS]; struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; @@ -95,7 +97,7 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); -NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); +NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1); NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1); NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1); @@ -223,17 +225,17 @@ static void* envIbAddrRange(sa_family_t af, int* mask) { *(maskStrPtr++) = '\0'; if (inet_pton(af, addrStrPtr, ret) == 0) { - WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6"); return NULL; } *mask = (int)strtol(maskStrPtr, NULL, 10); if (af == AF_INET && *mask > 32) { - WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } else if (af == AF_INET6 && *mask > 128) { - WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? 
"AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } @@ -314,7 +316,7 @@ static bool validGid(union ibv_gid* gid) { static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) { char gidRoceVerStr[16] = { 0 }; char roceTypePath[PATH_MAX] = { 0 }; - sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex); + snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex); int fd = open(roceTypePath, O_RDONLY); if (fd == -1) { @@ -423,6 +425,16 @@ NCCL_PARAM(IbDisable, "IB_DISABLE", 0); NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1); +// Returns 0 if this is the path of two VFs of the same physical device +static int ncclIbMatchVfPath(char* path1, char* path2) { + // Merge multi-port NICs into the same PCI device + if (ncclParamIbMergeVfs()) { + return strncmp(path1, path2, strlen(path1)-4) == 0; + } else { + return strncmp(path1, path2, strlen(path1)-1) == 0; + } +} + static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); @@ -430,14 +442,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { - // Merge multi-port NICs into the same PCI device - p[strlen(p)-1] = '0'; - // Also merge virtual functions (VF) into the same device - if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; - // And keep the real port aside (the ibv port is always 1 on recent cards) + // Keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dndevs > 1) { + WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL."); + return ncclInvalidUsage; + } + + if (props->ndevs == 0) { + WARN("NET/IB : Can't make virtual NIC with 0 devices"); + return ncclInvalidUsage; + } + + if (ncclNMergedIbDevs == MAX_IB_VDEVS) { + WARN("NET/IB : Cannot allocate any more virtual devices (%d)", MAX_IB_VDEVS); + return ncclInvalidUsage; + } + + // Always count up number of merged devices + ncclIbMergedDev* mDev = ncclIbMergedDevs + ncclNMergedIbDevs; + mDev->vProps.ndevs = 0; + mDev->speed = 0; + + for (int i = 0; i < props->ndevs; i++) { + ncclIbDev* dev = ncclIbDevs + props->devs[i]; + if (mDev->vProps.ndevs == NCCL_IB_MAX_DEVS_PER_NIC) return ncclInvalidUsage; + mDev->vProps.devs[mDev->vProps.ndevs++] = props->devs[i]; + mDev->speed += dev->speed; + // Each successive time, copy the name '+' new name + if (mDev->vProps.ndevs > 1) { + snprintf(mDev->devName + strlen(mDev->devName), sizeof(mDev->devName) - strlen(mDev->devName), "+%s", dev->devName); + // First time, copy the plain name + } else { + strncpy(mDev->devName, dev->devName, MAXNAMESIZE); + } + } + + // Check link layers + ncclIbDev* dev0 = ncclIbDevs + props->devs[0]; + for (int i = 1; i < props->ndevs; i++) { + if (props->devs[i] >= ncclNIbDevs) { + WARN("NET/IB : Cannot use physical device %d, max %d", props->devs[i], ncclNIbDevs); + return ncclInvalidUsage; + } + ncclIbDev* dev = ncclIbDevs + props->devs[i]; + if (dev->link != dev0->link) { + WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. 
Try only selecting NICs with one type of link using NCCL_IB_HCA", + dev0->devName, dev0->link, dev->devName, dev->link); + return ncclInvalidUsage; } } - return ncclNMergedIbDevs; + *d = ncclNMergedIbDevs++; + INFO(NCCL_NET, "NET/IB : Made virtual device [%d] name=%s speed=%d ndevs=%d", *d, mDev->devName, mDev->speed, mDev->vProps.ndevs); + return ncclSuccess; +} + +ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { + pthread_mutex_lock(&ncclIbLock); + ncclResult_t res = ncclIbMakeVDeviceInternal(d, props); + pthread_mutex_unlock(&ncclIbLock); + return res; } ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { @@ -531,10 +582,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } - // Should NCCL merge multi-port devices into one? - int mergeNics; - mergeNics = ncclParamIbMergeNics(); -build_ib_list: for (int d=0; dndevs > 1) { - // Print out merged dev info - snprintf(line+strlen(line), 2047-strlen(line), " [%d]={", d); - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDev = mergedDev->devs[i]; - snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName, - ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", - // Insert comma to delineate - i == (mergedDev->ndevs - 1) ? "" : ", "); - } - snprintf(line+strlen(line), 2047-strlen(line), "}"); - } else { - int ibDev = mergedDev->devs[0]; - snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName, - ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); - } - } - line[2047] = '\0'; - char addrline[SOCKET_NAME_MAXLEN+1]; - INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", - ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); } + + // Print out all net devices to the user (in the same format as before) + char line[2048]; + line[0] = '\0'; + // Determine whether RELAXED_ORDERING is enabled and possible + ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); + for (int d = 0; d < ncclNIbDevs; d++) { + snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, + ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + } + char addrline[SOCKET_NAME_MAXLEN+1]; + INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? 
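
// --- Illustrative sketch (not part of this patch) ---
// How a caller could fuse two physical IB devices into one virtual NIC via
// the makeVDevice entry point introduced above. Field names (ndevs, devs[])
// follow the vProps usage in this patch; the helper name fuseTwoIbDevs() is
// an assumption, and ncclNetVDeviceProps_t/memset are assumed in scope.
static ncclResult_t fuseTwoIbDevs(int physDev0, int physDev1, int* vDev) {
  ncclNetVDeviceProps_t vProps;
  memset(&vProps, 0, sizeof(vProps));
  vProps.ndevs = 2;            // both devices must share the same link_layer
  vProps.devs[0] = physDev0;   // indices into the physical ncclIbDevs[] list
  vProps.devs[1] = physDev1;
  // On success, *vDev is the index of the new merged (virtual) device.
  return ncclIbMakeVDevice(vDev, &vProps);
}
// -----------------------------------------------------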
"[RO]" : "", + ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); + pthread_mutex_unlock(&ncclIbLock); } exit: @@ -706,27 +709,25 @@ ncclResult_t ncclIbGdrSupport() { static __thread int ibDmaSupportInitDev; // which device to init, must be thread local static void ibDmaBufSupportInitOnce(){ ncclResult_t res; - // select the appropriate - struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; - // Test each real devices int dev_fail = 0; - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDev = mergedDev->devs[i]; - struct ibv_pd* pd; - struct ibv_context* ctx = ncclIbDevs[ibDev].context; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); - // Test kernel DMA-BUF support with a dummy call (fd=-1) - (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/); - // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) - dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT); - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); - // stop the search and goto failure - if (dev_fail) goto failure; - } - mergedDev->dmaBufSupported = 1; + + // This is a physical device, not a virtual one, so select from ibDevs + ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; + ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; + struct ibv_pd* pd; + struct ibv_context* ctx = ibDev->context; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); + // Test kernel DMA-BUF support with a dummy call (fd=-1) + (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/); + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) + dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT); + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + // stop the search and goto failure + if (dev_fail) goto failure; + ibDev->dmaBufSupported = 1; return; failure: - mergedDev->dmaBufSupported = -1; + ibDev->dmaBufSupported = -1; return; } // Detect whether DMA-BUF support is present in the kernel @@ -741,21 +742,20 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { // init the device only once ibDmaSupportInitDev = dev; pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce); - - int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported; + ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; + ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; + int dmaBufSupported = ibDev->dmaBufSupported; if (dmaBufSupported == 1) return ncclSuccess; return ncclSystemError; } #define NCCL_NET_IB_MAX_RECVS 8 -ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { - struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev; - props->name = mergedDev->devName; - props->speed = mergedDev->speed; - - // Take the rest of the properties from an arbitrary sub-device (should be the same) - struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0]; +ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { + struct ncclIbDev* ibDev = ncclIbDevs + dev; + pthread_mutex_lock(&ibDev->lock); + props->name = ibDev->devName; + props->speed = ibDev->speed; props->pciPath = ibDev->pciPath; props->guid = ibDev->guid; props->ptrSupport = NCCL_PTR_HOST; @@ -766,12 +766,29 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { if (ncclIbDmaBufSupport(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_DMABUF; // 
GDR support via DMA-BUF } + props->forceFlush = 0; props->latency = 0; // Not set props->port = ibDev->portNum + ibDev->realPort; props->maxComms = ibDev->maxQp; props->maxRecvs = NCCL_NET_IB_MAX_RECVS; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + pthread_mutex_unlock(&ibDev->lock); + return ncclSuccess; +} + +ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { + if (dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Requested properties for vNic %d, only %d vNics have been created", dev, ncclNMergedIbDevs); + return ncclInvalidUsage; + } + struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev; + // Take the rest of the properties from an arbitrary sub-device (should be the same) + NCCLCHECK(ncclIbGetPhysProperties(mergedDev->vProps.devs[0], props)); + props->name = mergedDev->devName; + props->speed = mergedDev->speed; + memcpy(&props->vProps, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t)); return ncclSuccess; } @@ -826,6 +843,8 @@ enum ncclIbCommState { ncclIbCommStateConnecting = 6, ncclIbCommStateConnected = 7, ncclIbCommStatePendingReady = 8, + ncclIbCommStateSendDevList = 9, + ncclIbCommStateRecvDevList = 10, }; struct ncclIbCommStage { @@ -890,12 +909,12 @@ struct ncclIbListenComm { struct ncclIbSendFifo { uint64_t addr; - int size; + uint64_t size; uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; uint32_t nreqs; uint32_t tag; uint64_t idx; - char padding[24]; + char padding[16]; }; struct ncclIbQp { @@ -927,7 +946,7 @@ struct ncclIbMrHandle { }; struct alignas(32) ncclIbNetCommBase { - int ndevs; + ncclNetVDeviceProps_t vProps; bool isSend; struct ncclIbRequest reqs[MAX_REQUESTS]; struct ncclIbQp qps[NCCL_IB_MAX_QPS]; @@ -938,6 +957,7 @@ struct alignas(32) ncclIbNetCommBase { int ready; // Track necessary remDevInfo here int nRemDevs; + int nDataQps; struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC]; // statistics about the comm struct ncclIbStats stats; @@ -981,7 +1001,6 @@ struct ncclIbRemFifo { struct alignas(16) ncclIbRecvCommDev { struct ncclIbNetCommDevBase base; struct ncclIbGpuFlush gpuFlush; - uint32_t fifoRkey; struct ibv_mr* fifoMr; struct ibv_sge fifoSge; struct ibv_mr* sizesFifoMr; @@ -989,7 +1008,7 @@ struct alignas(16) ncclIbRecvCommDev { struct ncclIbRecvComm { struct ncclIbNetCommBase base; - struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; + struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRemFifo remFifo; int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; int gpuFlushHostMem; @@ -1060,10 +1079,12 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, qpAttr.port_num = ib_port; qpAttr.qp_access_flags = access_flags; NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); + TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p", + ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd); return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool override_tc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = 
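
// --- Illustrative sketch (not part of this patch) ---
// What a caller of ncclIbGetProperties() sees for a fused NIC, assuming two
// 100000 Mb/s physical ports were merged into virtual device vDev (the
// variable vDev, the device names and the concrete numbers are assumptions):
ncclNetProperties_t props;
if (ncclIbGetProperties(vDev, &props) == ncclSuccess) {
  // props.name  -> "mlx5_0+mlx5_1"   (names joined with '+' at merge time)
  // props.speed -> 200000            (sum of the physical port speeds)
  // props.vProps.ndevs == 2; props.vProps.devs[] lists the physical indices
  // everything else (pciPath, guid, ptrSupport, ...) comes from the first
  // physical sub-device via ncclIbGetPhysProperties() above.
}
// -----------------------------------------------------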
IBV_QPS_RTR; @@ -1079,11 +1100,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - if(ncclParamIbFifoTc() && override_tc) { - qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc(); - } else { - qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); - } + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1108,6 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1154,10 +1172,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet int ready; *sendComm = NULL; - if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; - if (stage->state == ncclIbCommStateSend) goto ib_send; - if (stage->state == ncclIbCommStateConnecting) goto ib_connect; - if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; + if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; + if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list; + if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state == ncclIbCommStateConnecting) goto ib_connect; + if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; if (stage->state != ncclIbCommStateStart) { WARN("Error: trying to connect already connected sendComm"); return ncclInternalError; @@ -1178,21 +1198,51 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // IB Setup struct ncclIbMergedDev* mergedDev; + if (dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + return ncclInternalError; + } + mergedDev = ncclIbMergedDevs + dev; - comm->base.ndevs = mergedDev->ndevs; - comm->base.nqps = ncclParamIbQpsPerConn() * comm->base.ndevs; // We must have at least 1 qp per-device + comm->base.vProps = mergedDev->vProps; comm->base.isSend = true; + stage->state = ncclIbCommStateSendDevList; + stage->offset = 0; + struct ncclIbConnectionMetadata meta; + NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail); + memcpy(stage->buffer, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t)); + +// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps +ib_send_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + + stage->state = ncclIbCommStateRecvDevList; + stage->offset = 0; + +ib_recv_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if 
(stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + stage->offset = 0; + ncclNetVDeviceProps_t remoteVProps; + memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); + mergedDev = ncclIbMergedDevs + dev; + comm->base.vProps = mergedDev->vProps; + int localNqps, remoteNqps; + localNqps = ncclParamIbQpsPerConn() * comm->base.vProps.ndevs; // We must have at least 1 qp per-device + remoteNqps = ncclParamIbQpsPerConn() * remoteVProps.ndevs; + comm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote) // Init PD, Ctx for each IB device comm->ar = 1; // Set to 1 for logic - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDevN = mergedDev->devs[i]; + for (int i = 0; i < comm->base.vProps.ndevs; i++) { + int ibDevN = comm->base.vProps.devs[i]; NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail); - comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled + comm->ar = comm->ar && ncclIbDevs[ibDevN].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled } - struct ncclIbConnectionMetadata meta; - meta.ndevs = comm->base.ndevs; + memset(&meta, 0, sizeof(meta)); + meta.ndevs = comm->base.vProps.ndevs; // Alternate QPs between devices int devIndex; @@ -1211,10 +1261,10 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet } else { meta.qpInfo[q].ece_supported = 0; } - devIndex = (devIndex + 1) % comm->base.ndevs; + devIndex = (devIndex + 1) % comm->base.vProps.ndevs; } - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { ncclIbSendCommDev* commDev = comm->devs + i; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; @@ -1241,7 +1291,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", - comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", + comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); } @@ -1250,7 +1300,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", - comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, + comm->base.vProps.ndevs > 2 ? 
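
// --- Worked example (not part of this patch) ---
// The device-list exchange above lets both endpoints agree on the QP count
// even when one side is fused and the other is not. Assuming
// ncclParamIbQpsPerConn() == 1, a 2-NIC sender and a 1-NIC receiver compute:
//   int localNqps  = 1 /*qpsPerConn*/ * 2 /*local ndevs*/;          // = 2
//   int remoteNqps = 1 /*qpsPerConn*/ * 1 /*remote ndevs*/;         // = 1
//   int nqps = remoteNqps > localNqps ? remoteNqps : localNqps;     // = 2
// Both sides run the same max(), so they create matching sets of RC QPs.
// ------------------------------------------------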
"NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } @@ -1261,7 +1311,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet stage->state = ncclIbCommStateSend; stage->offset = 0; - NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail); memcpy(stage->buffer, &meta, sizeof(meta)); @@ -1282,17 +1331,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata)); comm->base.nRemDevs = remMeta.ndevs; - if (comm->base.nRemDevs != comm->base.ndevs) { - mergedDev = ncclIbMergedDevs + dev; - WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d", - mergedDev->devName, comm->base.ndevs, remMeta.devName, comm->base.nRemDevs); - } int link_layer; link_layer = remMeta.devs[0].link_layer; for (int i = 1; i < remMeta.ndevs; i++) { if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", + WARN("NET/IB : Can't connect net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); return ncclInternalError; } @@ -1309,7 +1353,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->remSizesFifo.addr = remMeta.fifoAddr; } - for (int i=0; i < comm->base.ndevs; i++) { + for (int i=0; i < comm->base.vProps.ndevs; i++) { NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); } comm->base.nRemDevs = remMeta.ndevs; @@ -1327,6 +1371,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet if (remQpInfo->ece_supported) NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail); + ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; + remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } @@ -1341,6 +1387,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet } } + comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs); + comm->base.ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; @@ -1359,6 +1407,50 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet goto exit; } +NCCL_PARAM(IbWarnRailLocal, "IB_WARN_RAIL_LOCAL", 0); + +ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDeviceProps_t* vProps2) { + ncclNetVDeviceProps_t outVProps = {0}; + ncclNetVDeviceProps_t* minVProps = vProps2; + ncclNetVDeviceProps_t* maxVProps = vProps1; + if (vProps2->ndevs > vProps1->ndevs) { + minVProps = vProps1; + maxVProps = vProps2; + } + + // Find the intersection of devices + for (int i = 0; i < minVProps->ndevs; i++) { + int dev = minVProps->devs[i]; + for 
(int j = 0; j < maxVProps->ndevs; j++) { + // Found + if (maxVProps->devs[j] == dev) { + outVProps.devs[outVProps.ndevs++] = dev; + } + } + } + + // In the case that at least one side has a fused NIC but there are no matching physical NICs, we should check if the user wants this + if (ncclParamIbWarnRailLocal() && outVProps.ndevs < maxVProps->ndevs) { + char local[128]; + int cursor = 1; + snprintf(local, sizeof(local), "%d", vProps1->devs[0]); + for (int i = 1; i < vProps1->ndevs; i++) { + snprintf(local+cursor, sizeof(local)-cursor, ",%d", vProps1->devs[i]); + cursor += 2; + } + char remote[128]; + snprintf(remote, sizeof(remote), "%d", vProps2->devs[0]); + cursor = 1; + for (int i = 1; i < vProps2->ndevs; i++) { + snprintf(remote+cursor, sizeof(remote)-cursor, ",%d", vProps2->devs[i]); + cursor += 2; + } + INFO(NCCL_NET, "NET/IB : There are mismatched physical devices between local (%s) and remote (%s). To disable this warning, set NCCL_IB_WARN_RAIL_LOCAL=0", local, remote); + } + + return ncclSuccess; +} + NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { @@ -1369,7 +1461,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle int ready; *recvComm = NULL; - if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; + if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; + if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list; + if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list; if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready; @@ -1385,14 +1479,49 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail); NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail); + // Alloc stage->buffer here to be used for all following steps + struct ncclIbConnectionMetadata remMeta; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta))); + ib_accept_check: NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail); if (!ready) return ncclSuccess; + stage->state = ncclIbCommStateRecvDevList; + stage->offset = 0; + +// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps +ib_recv_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + ncclNetVDeviceProps_t remoteVProps; + memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); + if (lComm->dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev); + return ncclInternalError; + } + + // Reduce the physical device list and store in the connection base + struct ncclIbMergedDev* mergedDev; + mergedDev = ncclIbMergedDevs + lComm->dev; + NCCLCHECK(ncclIbCheckVProps(&mergedDev->vProps, &remoteVProps)); + rComm->base.vProps = mergedDev->vProps; + memcpy(stage->buffer, &rComm->base.vProps, sizeof(ncclNetVDeviceProps_t)); + rComm->base.isSend = false; + int localNqps, remoteNqps; + localNqps = ncclParamIbQpsPerConn() * rComm->base.vProps.ndevs; // We must have at least 1 qp per-device + remoteNqps = 
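
// --- Worked example (not part of this patch) ---
// ncclIbCheckVProps() above computes the intersection of the two physical
// device lists only to decide whether to report a rail-locality mismatch;
// in this version the connection then keeps using the local vProps
// unchanged. For example:
//   local  vProps.devs = {0, 1}   (ndevs = 2)
//   remote vProps.devs = {1, 2}   (ndevs = 2)
//   intersection        = {1}     (ndevs = 1)
// Since 1 < 2, setting NCCL_IB_WARN_RAIL_LOCAL=1 makes the mismatch visible
// through the INFO message above.
// ------------------------------------------------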
ncclParamIbQpsPerConn() * remoteVProps.ndevs; + rComm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote) - struct ncclIbConnectionMetadata remMeta; - stage->state = ncclIbCommStateRecv; stage->offset = 0; - NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail); + stage->state = ncclIbCommStateSendDevList; + +ib_send_dev_list: + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset), ret, fail); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + + stage->offset = 0; + stage->state = ncclIbCommStateRecv; ib_recv: NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail); @@ -1403,7 +1532,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // IB setup // Pre-declare variables because of goto - struct ncclIbMergedDev* mergedDev; struct ncclIbDev* ibDev; int ibDevN; struct ncclIbRecvCommDev* rCommDev; @@ -1411,21 +1539,18 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle struct ncclIbQp* qp; mergedDev = ncclIbMergedDevs + lComm->dev; - rComm->base.ndevs = mergedDev->ndevs; - rComm->base.nqps = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device - rComm->base.isSend = false; - rComm->base.nRemDevs = remMeta.ndevs; - if (rComm->base.nRemDevs != rComm->base.ndevs) { - WARN("NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d", - mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs); + if (rComm->base.nRemDevs != rComm->base.vProps.ndevs) { + INFO(NCCL_NET, "NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d", + mergedDev->devName, rComm->base.vProps.ndevs, remMeta.devName, rComm->base.nRemDevs); } // Metadata to send back to requestor (sender) struct ncclIbConnectionMetadata meta; - for (int i = 0; i < rComm->base.ndevs; i++) { + memset(&meta, 0, sizeof(meta)); + for (int i = 0; i < rComm->base.vProps.ndevs; i++) { rCommDev = rComm->devs + i; - ibDevN = mergedDev->devs[i]; + ibDevN = rComm->base.vProps.devs[i]; NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail); ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); @@ -1456,7 +1581,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail); qp->devIndex = devIndex; - devIndex = (devIndex + 1) % rComm->base.ndevs; + devIndex = (devIndex + 1) % rComm->base.vProps.ndevs; // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { @@ -1469,23 +1594,22 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); + } else { + meta.qpInfo[q].ece_supported = 0; } - bool override_tc = (q == 0) ? 
true : false; - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0; - for (int i = 0; i < mergedDev->ndevs; i++) { + for (int i = 0; i < rComm->base.vProps.ndevs; i++) { rCommDev = rComm->devs + i; - ibDevN = rCommDev->base.ibDevN; - ibDev = ncclIbDevs + ibDevN; + ibDev = ncclIbDevs + rCommDev->base.ibDevN; // Retain remote fifo info and prepare my RDMA ops - rCommDev->fifoRkey = remMeta.devs[i].fifoRkey; rComm->remFifo.addr = remMeta.fifoAddr; NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey; @@ -1510,15 +1634,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle } // Fill Handle - meta.devs[i].lid = ibDev->portAttr.lid; - meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; - meta.devs[i].ib_port = ibDev->portNum; + meta.devs[i].lid = ibDev->portAttr.lid; + meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; + meta.devs[i].ib_port = ibDev->portNum; meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; - - // Adjust the MTU - remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); - meta.devs[i].mtu = remMeta.devs[i].mtu; + meta.devs[i].mtu = ibDev->portAttr.active_mtu; // Prepare sizes fifo NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); @@ -1530,9 +1651,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex; } - - meta.ndevs = rComm->base.ndevs; + meta.ndevs = rComm->base.vProps.ndevs; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); + rComm->base.nDataQps = std::max(rComm->base.vProps.ndevs, rComm->base.nRemDevs); stage->state = ncclIbCommStateSend; stage->offset = 0; @@ -1662,7 +1783,7 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui assert(size > 0); struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); - for (int i = 0; i < base->ndevs; i++) { + for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail); @@ -1706,9 +1827,11 @@ ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) 
{ + if (mhandle == NULL) return ncclSuccess; + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; - for (int i = 0; i < base->ndevs; i++) { + for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i])); @@ -1773,7 +1896,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; - int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps; for (int i = 0; i < nqps; i++) { int qpIndex = comm->base.qpIndex; ncclIbQp* qp = comm->base.qps + qpIndex; @@ -1822,7 +1945,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -1852,7 +1975,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh char line[SOCKET_NAME_MAXLEN + 1]; union ncclSocketAddress addr; ncclSocketGetAddr(&comm->base.sock, &addr); - WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x", + WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x", r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]); return ncclInternalError; } @@ -1868,7 +1991,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh req->send.offset = 0; // Populate events - int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + int nEvents = ncclParamIbSplitDataOnQps() ? 
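
// --- Illustrative note (not part of this patch) ---
// nDataQps = max(local ndevs, remote ndevs) is what ncclIbMultiSend() and
// ncclIbIsend()/ncclIbIrecv() fall back to when ncclParamIbSplitDataOnQps()
// is 0, so every rail is exercised even for asymmetric (fused vs. plain)
// endpoints. With the same 2-NIC/1-NIC example as above:
//   int nDataQps = std::max(2, 1);                                   // = 2
//   int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps
//                                          : comm->base.nDataQps;    // = 2
// --------------------------------------------------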
comm->base.nqps : comm->base.nDataQps; int qpIndex = comm->base.qpIndex; // Count down while (nEvents > 0) { @@ -1883,7 +2006,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh } // Store all lkeys - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey; } @@ -1909,7 +2032,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh return ncclSuccess; } -ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { +ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); @@ -1921,14 +2044,14 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int // Select the next devIndex (local) and QP to use for posting this CTS message // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex; - comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs; + comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs; for (int i=0; ibase.ndevs; j++) + for (int j = 0; j < comm->base.vProps.ndevs; j++) localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey; localElem[i].nreqs = n; @@ -1986,7 +2109,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -1999,7 +2122,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta req->sock = &comm->base.sock; req->nreqs = n; - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; } @@ -2011,7 +2134,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta TIME_START(1); // Select either all QPs, or one qp per-device - const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + const int nqps = ncclParamIbSplitDataOnQps() ? 
comm->base.nqps : comm->base.nDataQps; // Post recvs struct ibv_recv_wr* bad_wr; @@ -2047,7 +2170,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last]; // We don't know which devIndex the recv was on, so we flush on all devices - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->base.reqs; @@ -2078,7 +2201,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { *done = 0; while (1) { NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__)); - if (r->events[0] == 0 && r->events[1] == 0) { + if (r->events[0] == 0 && r->events[1] == 0 && r->events[2] == 0 && r->events[3] == 0) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { @@ -2112,13 +2235,13 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { char remoteGidString[INET6_ADDRSTRLEN] = ""; const char* localGidStr = NULL, *remoteGidStr = NULL; if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) { - localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); - remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); + localGidStr = ibvGetGidStr(&r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); + remoteGidStr = ibvGetGidStr(&r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); } char line[SOCKET_NAME_MAXLEN+1]; char *hcaName = r->devBases[i]->pd->context->device->name; - WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s", + WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%u vendor err %u (%s)%s%s%s%s hca %s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGids":"", remoteGidString, hcaName); return ncclRemoteError; @@ -2130,7 +2253,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", + TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif if (req && req->type == NCCL_NET_IB_REQ_SEND) { @@ -2174,7 +2297,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) { for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ncclIbSendCommDev* commDev = comm->devs + i; if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i])); @@ -2194,7 +2317,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ncclIbRecvCommDev* commDev = comm->devs + i; if (comm->flushEnabled) { if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp)); @@ -2237,6 +2360,11 @@ ncclNet_t ncclNetIb = { ncclIbCloseRecv, ncclIbCloseListen, NULL /* getDeviceMr */, - NULL /* irecvConsumed */ + NULL /* irecvConsumed */, + ncclIbMakeVDevice }; +/* + ncclIbSetProperties, + ncclIbRefreshDevices +*/ diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 73a5d55b0..235dee865 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -44,6 +44,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); + pthread_mutex_unlock(&ncclNetSocketLock); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) @@ -76,7 +77,7 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) { ncclResult_t ret = ncclSuccess; *speed = 0; char speedPath[PATH_MAX]; - sprintf(speedPath, "/sys/class/net/%s/speed", devName); + snprintf(speedPath, sizeof(speedPath), "/sys/class/net/%s/speed", devName); int fd = -1; SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd); if (fd != -1) { @@ -102,6 +103,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->guid = dev; props->ptrSupport = NCCL_PTR_HOST; props->regIsGlobal = 0; + props->forceFlush = 0; NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); props->latency = 0; // Not set props->port = 0; @@ -109,6 +111,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->maxRecvs = 1; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; return ncclSuccess; } @@ -297,6 +300,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || 
dev >= ncclNetIfs) { // data transfer socket is based on specified dev + WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs); return ncclInternalError; } ncclResult_t ret = ncclSuccess; @@ -558,16 +562,16 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; - NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request)); + NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; - NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request)); + NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); return ncclSuccess; } @@ -632,5 +636,6 @@ ncclNet_t ncclNetSocket = { ncclNetSocketClose, ncclNetSocketCloseListen, NULL /* getDeviceMr */, - NULL /* irecvConsumed */ + NULL /* irecvConsumed */, + NULL /* mergeDevices */ }; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index aa9c486b1..582c30a35 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -108,11 +108,12 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); CUCHECK(cuMemUnmap(ptr, size)); CUCHECK(cuMemAddressFree(ptr, size)); CUCHECK(cuMemRelease(*mcHandler)); + INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); return ncclSuccess; } @@ -450,11 +451,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { if (comm->localRank == 0) { shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) 
+ typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); } /* need 2 pools and a shared counter for shmem-based collectives */ comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; @@ -495,7 +496,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) { +ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, int *regUsed) { ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; CUdeviceptr regPtr = 0; @@ -601,43 +602,33 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset; - *regUsed = true; + *regUsed = 1; exit: free(regData); return ret; fail: - *regUsed = false; + *regUsed = 0; goto exit; } -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, struct ncclReg *sendRegRecord, struct ncclReg *recvRegRecord, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { ncclResult_t ret = ncclSuccess; - bool localRegBufUsed = false; + int regBufUsed = 0; struct localRegData *regData = NULL; bool sendNeedReg = false, recvNeedReg = false; CUdeviceptr regSendPtr = 0; CUdeviceptr regRecvPtr = 0; - struct ncclReg *sendRegRecord = NULL; - struct ncclReg *recvRegRecord = NULL; - - *outRegBufUsed = false; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks * 2), ret, fail); - if (sendbuff) { - NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail); - if (sendRegRecord) { - memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; - } + if (sendRegRecord) { + memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); + regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; } - if (recvbuff) { - NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail); - if (recvRegRecord) { - memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; - } + if (recvRegRecord) { + memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); + regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail); @@ -682,229 +673,127 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send } if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) { - localRegBufUsed = true; - INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg 
recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + regBufUsed = 1; + INFO(NCCL_REG, "rank %d reuse registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); goto exit; } /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate * in register request cache. */ - if (sendNeedReg && sendbuff) { - tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, ®SendPtr, &localRegBufUsed); - if (localRegBufUsed == false) goto fail; + if (sendNeedReg && sendbuff && sendbuffSize > 0) { + tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, ®SendPtr, ®BufUsed); + if (regBufUsed == 0) goto fail; } - if (recvNeedReg && recvbuff) { - tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, ®RecvPtr, &localRegBufUsed); - if (localRegBufUsed == false) goto fail; + if (recvNeedReg && recvbuff && recvbuffSize > 0) { + tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, ®RecvPtr, ®BufUsed); + if (regBufUsed == 0) goto fail; } - INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + INFO(NCCL_REG, "rank %d successfully registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); exit: *outRegBufSend = (void*)regSendPtr; *outRegBufRecv = (void*)regRecvPtr; - *outRegBufUsed = localRegBufUsed; + *outRegBufUsed = regBufUsed; free(regData); return ncclSuccess; fail: - localRegBufUsed = false; + regBufUsed = 0; + WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); goto exit; } +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + struct ncclReg *sendRegRecord = NULL; + struct ncclReg *recvRegRecord = NULL; + bool sendIsValid = false; + bool recvIsValid = false; + + *outRegBufUsed = 0; + if (sendbuff) { + NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord)); + NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid)); + } else { + sendIsValid = true; + } + if (recvbuff) { + NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord)); + NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid)); + } else { + recvIsValid = true; + } + + if (sendIsValid && recvIsValid) + NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + + return ncclSuccess; +} + struct ncclNvlsCleanupCallback { struct ncclCommCallback base; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; + struct ncclReg *reg; + struct ncclComm *comm; }; static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb; - NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); - 
INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, - bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, + int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded ) { - ncclResult_t ret = ncclSuccess; - bool localRegBufUsed = false; struct ncclNvlsCleanupCallback* sendRecord = NULL; struct ncclNvlsCleanupCallback* recvRecord = NULL; - CUdeviceptr regSendPtr = 0; - CUdeviceptr regRecvPtr = 0; - CUmulticastObjectProp mcprop; - CUmemAllocationProp ucprop; - char shareableHandle[NVLS_HANDLE_SIZE]; - CUmemGenericAllocationHandle sendMcHandle, recvMcHandle; - size_t sendGran = 0, recvGran = 0; - bool *regBufFlags = NULL; - struct graphRegData *rdata = NULL; - const void *baseSend = NULL; - const void *baseRecv = NULL; - size_t baseSendSize = 1; - size_t baseRecvSize = 1; - size_t ucgran; - - *outRegBufUsed = false; - NCCLCHECKGOTO(ncclCalloc(®BufFlags, comm->localRanks), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail); - - if (sendbuffSize > 0 || recvbuffSize > 0) { - /* retrieve base pointer and size */ - if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail; - if (sendbuff != NULL) - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail); - if (recvbuff != NULL) - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail); - - memset(&ucprop, 0, sizeof(CUmemAllocationProp)); - ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - ucprop.location.id = comm->cudaDev; - ucprop.requestedHandleTypes = ncclCuMemHandleType; - CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - - localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? 
false : true; - regBufFlags[comm->localRank] = localRegBufUsed; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail); - for (int i = 0; i < comm->localRanks; ++i) - if (regBufFlags[i] == false) goto fail; - - memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); - mcprop.numDevices = comm->localRanks; - mcprop.handleTypes = ncclCuMemHandleType; - mcprop.flags = 0; - - if (sendbuff != NULL) { - mcprop.size = baseSendSize; - CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); - - /* check send buffer offset and size */ - rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend; - rdata[comm->localRank].size = baseSendSize; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); - baseSendSize = rdata[0].size; - for (int i = 1; i < comm->localRanks; ++i) { - if (rdata[0].offset != rdata[i].offset) goto fail; - if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size; - } - if (baseSendSize % sendGran != 0) goto fail; - - mcprop.size = baseSendSize; + void *baseSend = NULL; + void *baseRecv = NULL; + size_t baseSendSize = 0; + size_t baseRecvSize = 0; + struct ncclReg *sendRegRecord = NULL; + struct ncclReg *recvRegRecord = NULL; - /* register sendbuff */ - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail); - } + *outRegBufUsed = 0; + if (sendbuff) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord)); + } - CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail); - CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail); + if (recvbuff) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); + } - // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®SendPtr, baseSendSize, sendGran, 0U, 0), ret, fail); - // Map the VA locally - CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + NCCLCHECK(nvlsRegisterBuffer(comm, baseSend, baseRecv, baseSendSize, baseRecvSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + if (*outRegBufUsed) { + if (sendRegRecord) { sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); sendRecord->base.fn = cleanupNvls; - sendRecord->mcHandle = sendMcHandle; - sendRecord->ptr = regSendPtr; - sendRecord->dev = comm->nvlsResources->dev; - sendRecord->size = baseSendSize; - } - - if 
(recvbuff != NULL) { - mcprop.size = baseRecvSize; - CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); - - rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv; - rdata[comm->localRank].size = baseRecvSize; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); - baseRecvSize = rdata[0].size; - for (int i = 1; i < comm->localRanks; ++i) { - if (rdata[0].offset != rdata[i].offset) goto fail; - if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size; - } - if (baseRecvSize % recvGran != 0) goto fail; - - mcprop.size = baseRecvSize; - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail); - } - - CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail); - CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail); - - // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®RecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail); - // Map the VA locally - CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail); - - recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); - recvRecord->base.fn = cleanupNvls; - recvRecord->mcHandle = recvMcHandle; - recvRecord->ptr = regRecvPtr; - recvRecord->dev = comm->nvlsResources->dev; - recvRecord->size = baseRecvSize; - } - - localRegBufUsed = true; - } - -exit: - if (localRegBufUsed == false) { - if (sendRecord) { - ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size); - free(sendRecord); - } - - if (recvRecord) { - // Yes, it's a dead code. That's fine... 
- // coverity[dead_error_begin] - ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size); - free(recvRecord); - } - } else { - if (sendRecord) { - *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend); + sendRecord->reg = sendRegRecord; + sendRecord->comm = comm; ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord); *nCleanupQueueEltsAdded += 1; } - if (recvRecord) { - *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv); + if (recvRegRecord) { + recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); + recvRecord->base.fn = cleanupNvls; + recvRecord->reg = recvRegRecord; + recvRecord->comm = comm; ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord); *nCleanupQueueEltsAdded += 1; } - - INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr); + } else { + if (sendbuff) NCCLCHECK(ncclCommGraphDeregister(comm, sendRegRecord)); + if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord)); } - *outRegBufUsed = localRegBufUsed; - free(regBufFlags); - free(rdata); - /* always return success. */ return ncclSuccess; -fail: - localRegBufUsed = false; - goto exit; } #else @@ -936,19 +825,19 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, - bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, + int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded ) { *outRegBufUsed = false; return ncclSuccess; } -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { *outRegBufUsed = false; return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 6569ae175..3ae514e45 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -91,6 +91,8 @@ struct p2pCuMemProxyInfo { #include +NCCL_PARAM(LegacyCudaRegister, "LEGACY_CUDA_REGISTER", 0); + /* Convert a PCI busId string into a local cudaDev device index (cf. 
CUDA_VISIBLE_DEVICES) */ static int busIdToCudaDev(int64_t busId) { int ndev; @@ -120,21 +122,9 @@ extern int64_t ncclParamMNNVLEnable(); ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); - // MNNVL support - if (comm->MNNVL && info1->hostHash != info2->hostHash) { - NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret)); - if (*ret) return ncclSuccess; - } - - // Rule out different nodes / isolated containers - if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { - *ret = 0; - return ncclSuccess; - } - // Check topology / p2p level. int intermediateRank; - NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; @@ -149,6 +139,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph return ncclSuccess; } + if (info1->hostHash != comm->peerInfo[comm->rank].hostHash || + info1->hostHash != info2->hostHash) { + // If either peer is non-local then we are done. + return ncclSuccess; + } + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); @@ -313,11 +309,11 @@ NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0); #define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash)) -static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { +static ncclResult_t p2pGetInfo(struct ncclComm* comm, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { int p2p; // Queries the topology to see if the GPUs are Ampere and // connected via NVLink, if so we enable P2P Read by default - NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank)); int readEnable = ncclParamP2pReadEnable(); if (readEnable != -2) *read = readEnable; @@ -367,7 +363,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; - NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank)); if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -386,7 +382,6 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; - send->conn.flags |= info->read ? 
NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr); } else { @@ -402,8 +397,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : ""); } - send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } + send->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE; } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; @@ -437,7 +432,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; - NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank)); static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; @@ -454,7 +449,6 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; - recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { if (ncclCuMemEnable()) { // cuMem API support @@ -465,8 +459,8 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st // Legacy CUDA IPC resources->type = P2P_IPC; } - recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } + recv->conn.flags |= info->read ? 
NCCL_P2P_READ : NCCL_P2P_WRITE; } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; @@ -807,9 +801,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru return ncclSuccess; } -ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) { - ncclResult_t ret = ncclSuccess; - struct ncclReg *regRecord = NULL; +static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, struct ncclReg* regRecord, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, bool* isLegacyIpc) { +ncclResult_t ret = ncclSuccess; struct ncclIpcRegInfo* newInfo = NULL; uintptr_t* peerRmtAddrs = NULL; bool legacyIpcCap = false; @@ -820,123 +813,125 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si *regBufFlag = 0; *offsetOut = 0; *peerRmtAddrsOut = NULL; - if (comm && userbuff && buffSize > 0 && nPeers > 0) { - NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); - if (regRecord) { - // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; - for (int p = 0; p < nPeers; p++) { - int peerRank = peerRanks[p]; - peerLocalRank = comm->rankToLocalRank[peerRank]; - if (regRecord->ipcInfos[peerLocalRank]) { - // We already have IPC info for peerLocalRank, no need to register it, we can reuse it - *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); - } else { - // Register buffer with peerLocalRank - struct ncclProxyConnector* proxyConn = NULL; - struct p2pIpcExpInfo ipcInfo; + if (isLegacyIpc) *isLegacyIpc = false; + if (regRecord) { + // buffer was registered by by users, we need to start to register or reuse it + int peerLocalRank; + for (int p = 0; p < nPeers; p++) { + int peerRank = peerRanks[p]; + peerLocalRank = comm->rankToLocalRank[peerRank]; + if (regRecord->ipcInfos[peerLocalRank]) { + // We already have IPC info for peerLocalRank, no need to register it, we can reuse it + *regBufFlag = 1; + if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap; + INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); + } else { + // Register buffer with peerLocalRank + struct ncclProxyConnector* proxyConn = NULL; + struct p2pIpcExpInfo ipcInfo; - if (baseAddr == NULL) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); - CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); - } - if (comm->gproxyConn[peerRank].initialized == false) - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); - proxyConn = &comm->gproxyConn[peerRank]; - - ipcInfo.legacyIpcCap = legacyIpcCap; - // Get the mem handle for that buffer. 
It may have been allocated through cudaMalloc in which case we'll - // get the CUDA legacy mem handle, or through cuMem*. - if (ipcInfo.legacyIpcCap) { - // legacy export - if (comm->directMode) goto fail; + if (baseAddr == NULL) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); + } + if (comm->gproxyConn[peerRank].initialized == false) + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); + proxyConn = &comm->gproxyConn[peerRank]; + + // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll + // get the CUDA legacy mem handle, or through cuMem*. + if (ncclCuMemEnable()) { + CUmemGenericAllocationHandle handle; + if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) { + // if cuMem* export fails, retry legacy export + if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail; CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - } else if (ncclCuMemEnable()) { - CUmemGenericAllocationHandle handle; - if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) { - // if cuMem* export fails, retry legacy export - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - ipcInfo.legacyIpcCap = true; + ipcInfo.legacyIpcCap = true; + if (isLegacyIpc) *isLegacyIpc = true; + } else { + ipcInfo.legacyIpcCap = false; + if (isLegacyIpc) *isLegacyIpc = false; + // cuMem* export to file descriptor or fabric handle + if (proxyConn->sameProcess) { + memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); } else { - // cuMem* export to file descriptor or fabric handle - if (proxyConn->sameProcess) { - memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int expFd = -1; + CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); + NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); + SYSCHECKGOTO(close(expFd), "close", ret, fail); } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - int expFd = -1; - CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); - NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); - SYSCHECKGOTO(close(expFd), "close", ret, fail); - } else { - // Allow this to silently fail for cases where the user buff cannot be registered - if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) { - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - goto fail; - } + // Allow this to silently fail for cases where the user buff cannot be registered + if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) { + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + goto fail; } } - CUCHECKGOTO(cuMemRelease(handle), ret, fail); } - } else { - // nothing works, just return - goto fail; + CUCHECKGOTO(cuMemRelease(handle), ret, fail); } + } else if (legacyIpcCap) { + // legacy export + if (comm->directMode || 
!ncclParamLegacyCudaRegister()) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + ipcInfo.legacyIpcCap = true; + if (isLegacyIpc) *isLegacyIpc = true; + } else { + // nothing works, just return + goto fail; + } - void* rmtRegAddr = NULL; - ipcInfo.size = baseSize; - ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; - // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side - // and get the remote register address back. - if (proxyConn) - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); - if (rmtRegAddr) { - NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); - assert(regRecord->ipcInfos[peerLocalRank] == NULL); - regRecord->state |= IPC_REG_COMPLETE; - newInfo->peerRank = peerRank; - newInfo->baseAddr = baseAddr; - newInfo->impInfo.rmtRegAddr = rmtRegAddr; - newInfo->impInfo.offset = ipcInfo.offset; - newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; - newInfo->ipcProxyconn = proxyConn; - regRecord->ipcInfos[peerLocalRank] = newInfo; - if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) { - NCCLCHECKGOTO(ncclCalloc(®Record->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); - } - regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; - needUpdate = true; - *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + void* rmtRegAddr = NULL; + ipcInfo.size = baseSize; + ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; + // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side + // and get the remote register address back. 
+ if (proxyConn) + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + if (rmtRegAddr) { + NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); + assert(regRecord->ipcInfos[peerLocalRank] == NULL); + regRecord->state |= IPC_REG_COMPLETE; + newInfo->peerRank = peerRank; + newInfo->baseAddr = baseAddr; + newInfo->impInfo.rmtRegAddr = rmtRegAddr; + newInfo->impInfo.offset = ipcInfo.offset; + newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; + newInfo->ipcProxyconn = proxyConn; + regRecord->ipcInfos[peerLocalRank] = newInfo; + if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) { + NCCLCHECKGOTO(ncclCalloc(®Record->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); } + regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; + needUpdate = true; + *regBufFlag = 1; + INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); } } + } - if (*regBufFlag) { - if (type == NCCL_IPC_COLLECTIVE) { - // for collective, store registered remote buffers into dev memory for future reference - if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); - if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) - NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - if (needUpdate) - NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); - } - peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; - } else { - assert(nPeers == 1); - // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct - peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; + if (*regBufFlag) { + if (type == NCCL_IPC_COLLECTIVE) { + // for collective, store registered remote buffers into dev memory for future reference + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) + NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + if (needUpdate) + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); } - *offsetOut = (uintptr_t)userbuff - regRecord->addr; - *peerRmtAddrsOut = peerRmtAddrs; + peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; + } else { + assert(nPeers == 1); + // 
p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct + peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; } + *offsetOut = (uintptr_t)userbuff - regRecord->addr; + *peerRmtAddrsOut = peerRmtAddrs; } } - exit: return ret; fail: @@ -944,146 +939,81 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si *offsetOut = 0; *peerRmtAddrsOut = NULL; if (newInfo) free(newInfo); + WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc); + goto exit; +} + +ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail); + } + +exit: + return ret; +fail: + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; goto exit; } struct ncclIpcCleanupCallback { struct ncclCommCallback base; - bool isAddrs; - union { - struct ncclIpcRegInfo regInfo; - struct ncclPeerRegIpcAddr regIpcAddrs; - }; + struct ncclComm *comm; + struct ncclReg *reg; }; static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb; - if (obj->isAddrs) { - if (obj->regIpcAddrs.hostPeerRmtAddrs) - free(obj->regIpcAddrs.hostPeerRmtAddrs); - if (obj->regIpcAddrs.devPeerRmtAddrs) - NCCLCHECK(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs)); - } else { - NCCLCHECK(ncclIpcDeregBuffer(comm, &obj->regInfo)); - } + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; - struct ncclProxyConnector* proxyConn = NULL; - struct p2pIpcExpInfo ipcInfo; void* baseAddr; size_t baseSize; struct ncclIntruQueue* cleanupQueue = reinterpret_cast*>(cleanupQueuePtr); - uintptr_t* peerRmtAddrs = NULL; - struct ncclIpcCleanupCallback* addrsRecord = NULL; + bool isLegacyIpc = false; + struct ncclReg *regRecord = NULL; *regBufFlag = 0; - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); - CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); - - if (type == NCCL_IPC_COLLECTIVE) { - // collective needs host memory array to hold all remote buffer addrs. 
- // We need to put this into graph release queue - NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail); - addrsRecord->base.fn = cleanupIpc; - addrsRecord->isAddrs = true; - NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); - } else { - assert(nPeers == 1); - // p2p does not need anything, just returning the remote buffer is enough, but for now, we register - // peer one by one so nPeers must be 1 - } - - for (int p = 0; p < nPeers; ++p) { - int peerRank = peerRanks[p]; - if (comm->gproxyConn[peerRank].initialized == false) - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); - proxyConn = &comm->gproxyConn[peerRank]; - // Same as local registration. Get the mem handle for that buffer. It may have been allocated through - // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*. - if (ipcInfo.legacyIpcCap) { - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - } else if (ncclCuMemEnable()) { - // cuMem* export - CUmemGenericAllocationHandle handle; - if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) { - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - ipcInfo.legacyIpcCap = true; - } else { - if (proxyConn->sameProcess) { - memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); - } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - int expFd = -1; - CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); - if (proxyConn->sameProcess) { - ipcInfo.impFd = expFd; - } else { - NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); - SYSCHECKGOTO(close(expFd), "close", ret, fail); - } - } else { - CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail); - } - } - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - } - } else { - goto fail; - } - - void* rmtRegAddr = NULL; - ipcInfo.size = baseSize; - ipcInfo.offset = 0; - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); - if (rmtRegAddr) { + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail); + if (*regBufFlag) { struct ncclIpcCleanupCallback* record; NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); record->base.fn = cleanupIpc; - record->isAddrs = false; - record->regInfo.peerRank = peerRank; - record->regInfo.baseAddr = baseAddr; - record->regInfo.impInfo.rmtRegAddr = rmtRegAddr; - record->regInfo.impInfo.offset = 0; - record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; - record->regInfo.ipcProxyconn = proxyConn; - // store the remote address into host addr array - if (type == NCCL_IPC_COLLECTIVE) - addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr; - else - peerRmtAddrs = 
(uintptr_t*)rmtRegAddr; - *regBufFlag = 1; - if (ipcInfo.legacyIpcCap) - ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base); - else - ncclIntruQueueEnqueue(cleanupQueue, &record->base); - if (nCleanupQueueElts) *nCleanupQueueElts += 1; - INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr); + record->comm = comm; + record->reg = regRecord; + if (isLegacyIpc) { + ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, (struct ncclCommCallback*)record); + } else { + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + if (nCleanupQueueElts) *nCleanupQueueElts += 1; + } + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); } } - if (type == NCCL_IPC_COLLECTIVE) { - // allocate the dev addr array and copy all previously stored addrs into it. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->nRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); - peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs; - if (ipcInfo.legacyIpcCap) - ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base); - else - ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base); - } - *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr; - *peerRmtAddrsOut = peerRmtAddrs; - exit: + // coverity[leaked_storage:FALSE] => normally, addrsRecord is added to the cleanupQueue return ret; fail: *regBufFlag = 0; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 9be95fd80..d2d6906e8 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -454,6 +454,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru } static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t result = ncclSuccess; struct shmRequest* req = (struct shmRequest*)reqBuff; /* check message size */ if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; @@ -463,13 +464,18 @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; - return ncclSuccess; +exit: + return result; +fail: + free(proxyInfo); + goto exit; } static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int 
reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t result = ncclSuccess; struct shmRequest* req = (struct shmRequest*)reqBuff; /* check message size */ if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; @@ -479,10 +485,14 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; - return ncclSuccess; +exit: + return result; +fail: + free(proxyInfo); + goto exit; } static void initCeOperation() { @@ -534,7 +544,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l } else { char shmPath[SHM_PATH_MAX] = { '\0' }; desc->shmli.shmSize = size; - NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle)); memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); desc->legacy = true; INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); @@ -542,7 +552,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l #else /* CUDART_VERSION >= 12020 */ char shmPath[SHM_PATH_MAX] = { '\0' }; desc->shmli.shmSize = size; - NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle)); memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); desc->legacy = true; INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr); @@ -618,15 +628,15 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); } else { char shmPath[SHM_PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); - NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); descOut->legacy = true; INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); } #else /* CUDART_VERSION >= 12020 */ char shmPath[SHM_PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); - NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); descOut->legacy = true; INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); #endif From d7ccab8b7e35eb6b0df5825c2813b2b8780b6d12 Mon Sep 17 00:00:00 
2001
From: Giuseppe Congiu
Date: Tue, 10 Dec 2024 06:29:57 -0800
Subject: [PATCH 03/21] Add profiler documentation

Add the following files:
- ext-profiler/README.md: plugin writer documentation
- ext-profiler/example/README.md: example plugin user documentation
---
 ext-profiler/README.md         | 318 +++++++++++++++++++++++++++++++++
 ext-profiler/example/README.md | 239 +++++++++++++++++++++++++
 2 files changed, 557 insertions(+)
 create mode 100644 ext-profiler/README.md
 create mode 100644 ext-profiler/example/README.md

diff --git a/ext-profiler/README.md b/ext-profiler/README.md
new file mode 100644
index 000000000..7ef44b2fa
--- /dev/null
+++ b/ext-profiler/README.md
@@ -0,0 +1,318 @@
+# NCCL Profiler Plugin Documentation
+
+This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
+
+# Overview
+
+To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
+interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
+use it for debugging and analysis.
+
+Similarly to other plugins (e.g., the network plugin), profiler plugins come as a shared library
+called `libnccl-profiler.so`. That shared library contains one or more implementations of the
+NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
+functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple profiler plugins
+
+When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
+it, then look for symbols inside the library.
+
+The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
+to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
+path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
+the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
+NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
+
+Conversely, and to ease the transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
+would still work.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
+they support to their internal includes. An example is shown in `ext-profiler/example`, where we
+keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
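+
+Putting the pieces together, a minimal plugin skeleton could look like the sketch below. The header
+name and function names are placeholders (not part of the API); only the exported, versioned
+`ncclProfiler_v2` symbol matters to NCCL. The function bodies are sketched in the following sections.
+
+```
+#include "profiler_v2.h"  // placeholder: header providing ncclProfiler_v2_t and the related types
+
+static ncclResult_t myInit(void** context, int* eActivationMask);
+static ncclResult_t myStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+static ncclResult_t myStopEvent(void* eHandle);
+static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v2_t eState,
+                                       ncclProfilerEventStateArgs_v2_t* eStateArgs);
+static ncclResult_t myFinalize(void* context);
+
+// Versioned symbol that NCCL resolves in libnccl-profiler.so.
+ncclProfiler_v2_t ncclProfiler_v2 = {
+  .name             = "myProfiler",
+  .init             = myInit,
+  .startEvent       = myStartEvent,
+  .stopEvent        = myStopEvent,
+  .recordEventState = myRecordEventState,
+  .finalize         = myFinalize,
+};
+```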
+
+# API (v2)
+
+Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+```
+
+## Error codes
+
+As a rule of thumb, profiler-generated errors should not be propagated to NCCL and alter its normal
+functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
+them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
+The only exception is `init`, which can return an error so that NCCL can disable the plugin.
+
+## Operation overview
+
+NCCL will call the `init` function first for every new communicator that is initialized. The profiler
+returns an opaque context handle that is used to isolate profiler instances across communicators.
+Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
+
+The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
+These are used to start, stop and update events in the profiler, respectively.
+
+## API Functions
+
+### Initialization
+
+#### name
+
+The `name` field should point to a character string with the name of the profiler plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+#### init
+
+As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
+This allows the plugin to initialize its internal context, used during profiling of NCCL events.
+If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
+
+#### finalize
+
+When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
+up resources.
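+
+As a sketch (the names, the chosen event selection and the context layout are illustrative, not
+mandated by the API), an `init`/`finalize` pair could allocate a per-communicator context and
+advertise which events the plugin wants to receive:
+
+```
+#include <stdlib.h>
+
+struct myContext {
+  int activeEvents;  // copy of the event mask advertised to NCCL
+  // per-communicator profiler state (event pools, output file, ...) would go here
+};
+
+static ncclResult_t myInit(void** context, int* eActivationMask) {
+  struct myContext* ctx = (struct myContext*)calloc(1, sizeof(struct myContext));
+  if (ctx == NULL) return ncclInternalError;  // a failing init makes NCCL disable the plugin
+  // Illustrative choice: instrument group, collective and proxy op events.
+  *eActivationMask = ncclProfileGroup | ncclProfileColl | ncclProfileProxyOp;
+  ctx->activeEvents = *eActivationMask;
+  *context = ctx;
+  return ncclSuccess;
+}
+
+static ncclResult_t myFinalize(void* context) {
+  // Called once per communicator: release everything allocated in init.
+  free(context);
+  return ncclSuccess;
+}
+```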
+
+### Profiling
+
+#### startEvent
+
+When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
+context, previously created by `init`, and an event descriptor of type `ncclProfilerEventDescr_t`, and
+returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
+later in the document.
+
+The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
+is the `ncclProfilerEventDescr_t` struct.
+
+```
+typedef struct {
+  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
+  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                 // rank that generated the event
+  union {
+    struct {                // collective events metadata
+      const char* name;     // string containing name of the communicator
+      uint64_t commHash;    // unique hash/id for the communicator
+      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
+      const char* func;     // string containing name of the collective
+      void const* sendBuff; // address of send buffer
+      void* recvBuff;       // address of recv buffer
+      size_t count;         // data count
+      int root;             // root rank
+      const char* datatype; // string containing the name of the datatype
+      size_t trafficBytes;  // number of transfer bytes
+      uint8_t nMaxChannels; // max number of channels for this collective
+      uint8_t nWarps;       // number of GPU warps for this collective
+      const char* algo;     // string containing name of the algorithm for this collective
+      const char* proto;    // string containing name of the protocol for this collective
+    } coll;
+
+    struct {                // point-to-point events metadata
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;             // peer rank for this point-to-point
+    } p2p;
+
+    struct {                // proxyOp events metadata
+      pid_t pid;            // process id that generated the associated `ncclProxyOp` object
+      uint8_t channelId;    // id of the channel used by the associated `ncclProxyOp` object
+      int peer;             // peer rank
+      int nSteps;           // number of network transfers/steps required by the `ncclProxyOp`
+      int chunkSize;        // chunk size for this `ncclProxyOp`
+      int isSend;           // set to 1 for sends and 0 for recvs
+    } proxyOp;
+
+    struct {                // proxyStep events metadata
+      int step;             // individual step in `ncclProxyOp`
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+```
+
+NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
+
+#### stopEvent
+
+`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
+has been stopped the handle can no longer be used with other profiler calls. Using the event
+handle after `stopEvent` is undefined behavior.
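+
+As an illustration, a `startEvent`/`stopEvent` pair could allocate a small per-event record, timestamp
+it, and release it when the event stops. The record layout and the `nowUs()` helper are made up for
+this sketch; only the descriptor fields shown above are assumed.
+
+```
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+
+struct myEvent {
+  uint8_t type;            // ncclProfileGroup, ncclProfileColl, ncclProfileProxyOp, ...
+  double startUs, stopUs;  // begin/end timestamps in microseconds
+};
+
+static double nowUs(void) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec * 1e6 + ts.tv_nsec * 1e-3;
+}
+
+static ncclResult_t myStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
+  (void)context;
+  struct myEvent* ev = (struct myEvent*)calloc(1, sizeof(struct myEvent));
+  // Returning a NULL handle is allowed; the plugin then simply ignores later calls for that handle.
+  if (ev == NULL) { *eHandle = NULL; return ncclSuccess; }
+  ev->type = eDescr->type;
+  ev->startUs = nowUs();
+  // A real plugin would also copy type-specific metadata, e.g. eDescr->coll.func for collectives.
+  *eHandle = ev;
+  return ncclSuccess;
+}
+
+static ncclResult_t myStopEvent(void* eHandle) {
+  struct myEvent* ev = (struct myEvent*)eHandle;
+  if (ev == NULL) return ncclSuccess;
+  ev->stopUs = nowUs();
+  // Emit or buffer the completed record here, then release it (a real plugin would typically
+  // return it to a pool). The handle must not be used after this call.
+  free(ev);
+  return ncclSuccess;
+}
+```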
+
+#### recordEventState
+
+Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl` and
+`ncclProfileP2p` cannot be updated through calls to `recordEventState`.
+
+`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through
+calls to `recordEventState`.
+
+The state of proxy-generated events can be updated, along with event attributes, using
+`recordEventState`. These events can go through several states during their lifecycle.
+The list of supported states for the proxy-defined events is reported below.
+
+```
+typedef enum {
+  // ncclProfileProxyOp event states
+  ncclProfilerProxyOpSendPosted,        // state marks the posting of send buffer to GPU for given network transfer/step
+  ncclProfilerProxyOpSendRemFifoWait,   // state marks the waiting of CTS credits from peer rank
+  ncclProfilerProxyOpSendTransmitted,   // state marks the sending of network transfer/step to peer rank
+  ncclProfilerProxyOpSendDone,          // state marks the ending of network transfer/step
+  ncclProfilerProxyOpRecvPosted,        // state marks the posting of recv to network for given network transfer/step
+  ncclProfilerProxyOpRecvReceived,      // state marks the recving of network transfer/step from peer rank
+  ncclProfilerProxyOpRecvTransmitted,   // state marks the ending of the network transfer/step
+  ncclProfilerProxyOpRecvDone,          // state marks the consuming of data from GPU
+
+  // ncclProfileProxyStep event states
+  ncclProfilerProxyStepSendGPUWait,     // state marks the waiting of send data from GPU for given network transfer/step
+  ncclProfilerProxyStepSendWait,        // state marks the waiting of send data from network for given network transfer/step
+  ncclProfilerProxyStepRecvWait,        // state marks the waiting of recv data from network for given network transfer/step
+  ncclProfilerProxyStepRecvFlushWait,   // state marks the waiting of recv data flush to GPU for given network transfer/step
+  ncclProfilerProxyStepRecvGPUWait,     // state marks the waiting of recv data consumption from GPU for given network transfer/step
+
+  // ncclProfileProxyCtrl event states
+  ncclProfilerProxyCtrlIdle,            // state marks proxy progress thread idle
+  ncclProfilerProxyCtrlActive,          // state marks proxy progress thread active
+  ncclProfilerProxyCtrlSleep,           // state marks proxy progress thread sleeping
+  ncclProfilerProxyCtrlWakeup,          // state marks proxy progress thread waking up
+  ncclProfilerProxyCtrlAppend,          // state marks append of new network work item begin
+  ncclProfilerProxyCtrlAppendEnd,       // state marks append of new network work item end
+} ncclProfilerEventState_v2_t;
+```
+
+`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyOp events are generated for every active channel and
+provide a summary of the activity of the proxy progress thread for that channel.
+
+`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyStep events describe individual network transfers in
+the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
+
+`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
+network requests for the GPU kernel. This includes everything else that the proxy thread might be
+doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
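+
+For example, a plugin could translate these transitions into per-event timelines. The record below is a
+standalone sketch (it reuses the illustrative `nowUs()` helper from the earlier sketch); which attribute
+is valid for which transition is an assumption made here purely for illustration.
+
+```
+struct myProxyEvent {
+  ncclProfilerEventState_v2_t lastState;  // most recent state transition observed
+  double lastStateUs;                     // timestamp of that transition
+  size_t transSize;                       // bytes transferred so far, for ProxyOp events
+};
+
+static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v2_t eState,
+                                       ncclProfilerEventStateArgs_v2_t* eStateArgs) {
+  struct myProxyEvent* ev = (struct myProxyEvent*)eHandle;
+  if (ev == NULL) return ncclSuccess;  // event was dropped at startEvent time
+  ev->lastState = eState;
+  ev->lastStateUs = nowUs();
+  // ProxyOp updates may carry optional attributes (see the args struct below); here we assume the
+  // transmitted transitions report the running byte count.
+  if (eStateArgs != NULL &&
+      (eState == ncclProfilerProxyOpSendTransmitted || eState == ncclProfilerProxyOpRecvTransmitted)) {
+    ev->transSize = eStateArgs->proxyOp.transSize;
+  }
+  return ncclSuccess;
+}
+```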
+
+State transitions for the events described can also come with event attribute updates. For this
+reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
+
+```
+typedef union {
+  struct {                 // attributes to update for ncclProfileProxyOp events
+    size_t transSize;      // data transferred thus far
+    int steps;             // network transfer/steps processed thus far
+  } proxyOp;
+
+  struct {                 // attributes to update for ncclProfileProxyCtrl
+    int appendedProxyOps;  // number of appended proxy ops thus far
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+```
+
+The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
+
+### Event hierarchy
+
+NCCL core events (reported above) are organized into a hierarchy as reported below:
+
+```
+Group event
+   |
+   +- Collective event
+   |      |
+   |      +- ProxyOp event
+   |             |
+   |             +- ProxyStep event
+   |
+   +- Point-to-point event
+          |
+          +- ProxyOp event
+                 |
+                 +- ProxyStep event
+
+ProxyCtrl event
+```
+
+# Profiler instrumentation and logging
+
+## Profiling of collective and p2p operations
+
+The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
+collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature
+of NCCL operations, events associated with collective and point-to-point operations are not easy to delimit
+precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
+figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
+the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if
+those events are enabled, to estimate when the collective ends. In this case, the profiler can look at the
+`stopEvent` call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective
+event. This can be achieved by reference counting the collective event and letting calls to `startEvent` and
+`stopEvent` increment and decrement the reference counter, respectively.
+
+## PXN
+
+PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
+generated the operation. When this happens, the event hierarchy reported above breaks. Because the
+profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
+parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
+thread originating the operation. To prevent the profiler instance in the remote proxy address space from
+dereferencing a pointer that belongs to another address space, the event descriptor includes the PID of the
+originator. The profiler plugin needs to check that the originator PID matches the local PID before
+dereferencing the parent event.
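+
+A minimal sketch of that check (the helper name is illustrative; `eDescr->proxyOp.pid` and
+`eDescr->parentObj` are the descriptor fields introduced earlier):
+
+```
+#include <unistd.h>
+
+// Return 1 only if it is safe to dereference eDescr->parentObj, i.e. the ProxyOp event was
+// generated by a proxy thread running in this process.
+static int parentIsLocal(ncclProfilerEventDescr_v2_t* eDescr) {
+  if (eDescr->type != ncclProfileProxyOp) return 1;
+  return eDescr->proxyOp.pid == getpid();
+}
+```
+
+A plugin would call such a helper at the top of `startEvent` and, for remote ProxyOp events, either drop
+the event or track it in a detached pool instead of attaching it to a parent from another process.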
For example, users can
+change the size of the event window the profiler keeps track of.
+
+## Building the profiler plugin
+
+To build the example plugin, just type `make`. You will need the include directory of an NCCL build to be
+present; you can override `NCCL_HOME` to point to the NCCL installation on your system.
+
+## Using the profiler plugin
+
+1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set `NCCL_PROFILER_PLUGIN`,
+   as documented in `ext-profiler/README.md`.
+
+2. Set the `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
+   default, all collectives and send/recv operations will be traced. For more details about the event
+   representation used by the profiler, refer to `ext-profiler/README.md`.
+
+   As an example, setting:
+
+   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+
+   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
+   NCCL also captures all the events that sit above the requested one in the event hierarchy. The advantage
+   is that the profiler can easily correlate events that belong to the same NCCL operation and present
+   them accordingly.
+
+3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
+   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the Chrome
+   event format (more precisely, using asynchronous events).
+
+4. If you set the dump file variable, open chrome://tracing in your Chromium browser's address bar and
+   load the created dump file to visualize the traces.
+
+# Changing the profiler memory pool sizes
+
+The example profiler uses separate memory pools for different types of events. The size of these memory
+pools (i.e., the number of event objects) determines how many events the profiler can keep track of at the
+same time. When NCCL requests a new event (e.g., a collective event) to profile a `ncclAllReduce`
+operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
+finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used, the
+profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
+handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
+event handle. The profiler has a total of 5 memory pools.
+
+The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
+contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
+generated by remote proxies. A list of the pools and their sizes is reported below:
+
+- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+
+Remote proxy operations are generated when PXN is in use. Refer to this article for more information
+about PXN and how it works:
+https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
+
+# Reported events
+
+The example profiler generates traces using the JSON format.
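+
+As a reference, a dump like the one shown next can be produced with settings along the following lines
+(the plugin path, the launcher and the application name are placeholders, not something NCCL mandates):
+
+```
+export NCCL_PROFILER_PLUGIN=/path/to/libnccl-profiler-example.so   # or add its directory to LD_LIBRARY_PATH
+export NCCL_PROFILE_EVENT_MASK=8          # ncclProfileProxyOp; parent group/collective events are captured implicitly
+export NCCL_PROFILE_DUMP_FILE=nccl_trace  # creates nccl_trace-<hostname>-<tid>.txt files
+mpirun -np 8 ./my_nccl_app                # any NCCL application; the trace below was collected on 8 GPUs
+```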
An example of trace is reported below: + +``` +[ +{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}}, +{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}}, +{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}}, 
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000}, +{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 
770013.848633},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+ ... [ trace truncated for brevity ]
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
+{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{}]
+```
+
+Details about the fields used in the trace can be found at this link:
+https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
+
+The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
+(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
+one collective and this is what is presented in the traces above).
+
+The `AllReduce` event encloses traces for the proxy operation associated with the `ncclAllReduce` operation. The `args`
+field in the traces contains NCCL-specific information (in addition to the standard Chrome trace event fields).
+
+## AllReduce trace
+
+The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
+
+- seqNum      : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
+- commHash    : communicator unique identifier
+- rank        : NCCL rank for the ncclAllReduce
+- datatype    : NCCL datatype
+- algorithm   : algorithm used to process the ncclAllReduce
+- protocol    : protocol used to process the ncclAllReduce
+- nMaxChannels: max number of channels used to process the ncclAllReduce
+
+If the proxy events are not active (e.g., the `ncclAllReduce` is intranode), the end timestamp will only reflect the
+time taken by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
+of collective and p2p operations`.
+
+### Proxy Send
+
+The `Send` entry presents information about the `ProxyOp` processing in the progress thread.
It contains the following
+info in the args field:
+
+- Channel      : id of the channel used by this proxy operation to send data to the peer
+- Peer         : peer rank
+- Steps        : number of network steps required to transfer transSize bytes to the peer
+- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize    : bytes transferred across the channel by this proxy operation
+- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
+- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
+- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
+- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
+
+In case of a network problem, the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE entries might all have partially
+updated steps, which could help identify at which point the network problem occurred.
+
+The Proxy Send trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy SendBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
+
+#### Proxy SendGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
+buffer.
+
+#### Proxy SendWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete.
+
+### Proxy Recv
+
+The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel    : id of the channel used by this proxy operation to recv data from the peer
+- Peer       : peer rank
+- Steps      : number of network steps required to transfer transSize bytes from the peer
+- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize  : bytes transferred across the channel by this proxy operation
+- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
+- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
+- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
+- DONE       : struct containing the number of flushes completed and the time stamp for the last flush completed
+
+The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy RecvBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to
+become available.
+
+#### Proxy RecvWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete.
+
+#### Proxy RecvFlushWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU.
+
+#### Proxy RecvGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data.

From 1672c85781ba6158d5d173d3ecac969f8796af11 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Fri, 17 Jan 2025 02:03:22 -0800
Subject: [PATCH 04/21] Fix packaging scripts.

Issue #1578
---
 pkg/debian/Makefile               | 2 +-
 pkg/debian/libnccl-dev.install.in | 2 +-
 pkg/debian/rules                  | 3 +++
 pkg/redhat/nccl.spec.in           | 6 ++++--
 pkg/txz/create_txz.sh.in          | 2 +-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 0494f3e03..650ca4270 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -25,7 +25,7 @@ prep : $(DEBTARGETS)
 build : prep
 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
 	@printf "Building Debian package\n"
-	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
+	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
 	mkdir -p $(PKGDIR)
 	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
 
diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in
index 13eca26c6..45120e6de 100644
--- a/pkg/debian/libnccl-dev.install.in
+++ b/pkg/debian/libnccl-dev.install.in
@@ -1,4 +1,4 @@
+bin/ncclras /usr/bin
 include/nccl.h /usr/include
-include/nccl_net.h /usr/include
 lib/libnccl.so /usr/lib/${pkg:MultiArch}
 lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
diff --git a/pkg/debian/rules b/pkg/debian/rules
index 23b90a9e0..8005d3020 100755
--- a/pkg/debian/rules
+++ b/pkg/debian/rules
@@ -11,3 +11,6 @@ override_dh_auto_test:
 
 override_dh_auto_clean:
 	# Do not make clean
+
+override_dh_builddeb:
+	dh_builddeb -- -Zxz
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 8e5aed6f3..d62955592 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -20,6 +20,7 @@ sockets.
%package devel Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries +Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch} %description devel NCCL development files @@ -44,9 +45,10 @@ install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUI ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} # devel +install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} +install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} -install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static @@ -64,8 +66,8 @@ rm -rf $RPM_BUILD_ROOT %files devel %doc LICENSE.txt %defattr(-,root,root,-) +%{_bindir}/ncclras %{_includedir}/nccl.h -%{_includedir}/nccl_net.h %{_libdir}/libnccl.so %files static diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in index deae85483..88f961325 100644 --- a/pkg/txz/create_txz.sh.in +++ b/pkg/txz/create_txz.sh.in @@ -21,4 +21,4 @@ PKG_ARCH=${pkg:Arch} NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" -tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt +tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt From 80f6bda4378b99d99e82b4d76a633791cc45fef0 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Mon, 27 Jan 2025 03:30:22 -0800 Subject: [PATCH 05/21] NCCL 2.25.1-1 Add Blackwell/SM100 support * Add compilation for sm100 * Add graph search speeds for Blackwell * Optimize graph search to converge on large NVLink domains * Limit NVLS heads to 32 * Increase various limits to fit large NVLink domains * Add extra checks for IMEX setup, needed for MNNVL * Increase MAXCHANNELS to 64 Extend NVTX instrumentation to track NCCL communicators * Add communicator ID to NVTX traces to allow for correlation between ranks. 
RAS fixes --- makefiles/common.mk | 8 +- makefiles/version.mk | 4 +- src/Makefile | 2 +- src/collectives.cc | 79 ++----- src/device/Makefile | 2 +- src/device/all_reduce.h | 2 +- src/enqueue.cc | 24 ++- src/graph/connect.cc | 7 +- src/graph/paths.cc | 35 +++- src/graph/search.cc | 45 ++-- src/graph/topo.cc | 57 +++--- src/graph/topo.h | 7 +- src/graph/tuning.cc | 21 +- src/include/alloc.h | 11 +- src/include/device.h | 6 +- src/include/enqueue.h | 2 +- src/include/graph.h | 3 +- src/include/mnnvl.h | 15 ++ src/include/nvtx.h | 85 ++++++-- src/include/nvtx3/nvToolsExtPayloadHelper.h | 6 +- src/include/nvtx_payload_schemas.h | 125 ++++++++++++ src/include/proxy.h | 2 + src/init.cc | 215 ++++++++------------ src/mnnvl.cc | 82 ++++++++ src/proxy.cc | 36 ++-- src/ras/client_support.cc | 6 +- src/ras/ras_internal.h | 2 +- src/register/coll_reg.cc | 18 +- src/transport/nvls.cc | 12 +- src/transport/p2p.cc | 14 +- 30 files changed, 603 insertions(+), 330 deletions(-) create mode 100644 src/include/mnnvl.h create mode 100644 src/include/nvtx_payload_schemas.h create mode 100644 src/mnnvl.cc diff --git a/makefiles/common.mk b/makefiles/common.mk index 82164ab5c..1b1bb8674 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -39,14 +39,20 @@ endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 +CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 +CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0) +# Include Blackwell support if we're using CUDA12.8 or above + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) +else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) diff --git a/makefiles/version.mk b/makefiles/version.mk index 252300934..b02cf909c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 24 -NCCL_PATCH := 3 +NCCL_MINOR := 25 +NCCL_PATCH := 1 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 2c5d9e863..b66ebefa2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ diff --git a/src/collectives.cc b/src/collectives.cc index 479d4c511..03122f8a7 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -8,6 +8,7 @@ #include "collectives.h" #include "enqueue.h" #include "nccl.h" +#include "nvtx_payload_schemas.h" const 
char* ncclFuncToString(ncclFunc_t fn) { switch (fn) { @@ -78,11 +79,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. - constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} - }; - size_t msgsize = sendcount * ncclTypeSize(datatype); - NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) + NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype))); struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ @@ -94,18 +92,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsAllReduce { - size_t bytes; - ncclRedOp_t op; - }; - // Just pass the size of one message and not the total bytes sent/received. - static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsAllReduce, op)} - }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ @@ -117,16 +105,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsBroadcast { - size_t bytes; - int root; - }; - constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} - }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; - NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root)); struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ @@ -145,19 +125,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsReduce { - size_t bytes; - int root; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduce, op)} - }; - NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; - NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op)); struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ @@ -169,17 +138,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsReduceScatter { - size_t bytes; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)} - }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) + NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ @@ -187,21 +147,12 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv return ncclEnqueueCheck(&info); } -struct NvtxParamsSendRecv { - size_t bytes; - int peer; -}; -constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} -}; - NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ @@ -213,8 +164,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ diff --git a/src/device/Makefile b/src/device/Makefile index 1e9311f1f..3562563fc 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -5,7 +5,7 @@ # SHELL := /usr/bin/env bash -MAKEFALGS += -r +MAKEFLAGS += -r .SUFFIXES: .SECONDARY: diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index c6c131517..216159747 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -436,7 +436,7 @@ struct RunWorkCollregUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } - } else if (tid < tidEndReduce) { + } else if (tid < tidEndReduce && nvls->headRank != -1) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> diff --git a/src/enqueue.cc b/src/enqueue.cc index 285e17f69..23f463397 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -21,19 +21,21 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; + int print = 0; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); + int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); for (int k=0; k < ncclDevKernelCount; k++) { void* fn = ncclDevKernelList[k]; + cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (maxStackSize) { - cudaFuncAttributes attr = {0}; - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } @@ -43,9 +45,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { result, ignore1); ignore1:; } - if (ncclShmemDynamicSize(cudaArch) != 0) { + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + if (print++ == 0) + INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + // Reduce requested MaxDynamicSharedMemorySize attribute + sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + } CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), result, next_kernel); } next_kernel:; @@ -1445,7 +1455,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan 
NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; - unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; + unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; CUlaunchAttribute launchAttrs[3]; @@ -1597,7 +1607,7 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 3f639a022..64fc1c5dd 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -19,7 +19,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; - int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks; int nChannels = comm->nChannels; topoRanks->nvlsHeadNum = 0; @@ -74,7 +73,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs // Get nvls heads and the number of heads. Duplicate head is not allowed. for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { bool addHead = true; - int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks; + int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks; for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) { if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) { @@ -259,8 +258,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - channel->nvls.node = comm->node; - channel->nvls.nNodes = comm->nNodes; if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -466,7 +463,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Use 4 compute channels per search channel to reach peak BW on <8 PPN - if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { + if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 6e9356826..587a8b282 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -828,14 +828,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr return ncclSuccess; } -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { - int minPath = PATH_DIS; +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) { + int minPath = PATH_SYS; for (int i=0; inodes[GPU].count; i++) { - struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; - for (int j=0; jnodes[GPU].count; j++) { - if (i == j) continue; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; minPath = std::min(minPath, paths[j].type); } } - return minPath >= PATH_PIX ? 0 : 1; + *min = minPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) { + int maxPath = PATH_LOC; + for (int i=0; inodes[GPU].count; i++) { + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; + maxPath = std::max(maxPath, paths[j].type); + } + } + *max = maxPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) { + int maxPath; + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath)); + *allNvLink = maxPath >= PATH_PIX ? 0 : 1; + return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9b72ac160..0185b3f7b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -937,6 +937,11 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; +float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) +#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && @@ -946,8 +951,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->crossNic = crossNic == 1 ? 
1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; - graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - graph->typeInter = PATH_PIX; + int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX; + int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS; + if (ngpus > 1) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); + } + if (system->nodes[NET].count > 0) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); + maxTypeIntra = maxTypeInter; + } + + graph->typeIntra = minTypeIntra; + graph->typeInter = minTypeInter; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1; graph->sameChannels = trySameChannels; @@ -972,14 +989,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; // NVLS and COLLNET_DIRECT search must have ngpus heads at most. - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) - graph->maxChannels = system->nodes[GPU].count; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count); + if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count); if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. - graph->minChannels = graph->maxChannels = system->nodes[GPU].count; + graph->minChannels = graph->maxChannels; } struct ncclTopoGraph tmpGraph; @@ -989,11 +1006,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { - nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA; - speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra; + nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); + speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra); } else { - nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER; - speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter; + nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER); + speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter); } int pass = 1; int speedIndex = 0; @@ -1048,18 +1065,18 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } tmpGraph.pattern = graph->pattern; - int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; - if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { + int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } - tmpGraph.typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; + tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } - tmpGraph.typeInter = PATH_PIX; + tmpGraph.typeInter = minTypeInter; if (crossNic == 2 && tmpGraph.crossNic == 0 && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d758ac989..ba82cafb7 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -1357,11 +1357,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy goto exit; } -ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { +static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, + int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { @@ -1371,7 +1371,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index if (pathType) *pathType = minType; count = 0; } - if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; + if (paths[i].bw == maxBw && paths[i].type == minType) { + if (count == NCCL_TOPO_MAX_NODES) { + WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal." 
+ " Filled %d of type %d, starting from index %d of type %d.", + NCCL_TOPO_MAX_NODES, resultType, index, type); + return ncclInternalError; + } + locals[count++] = i; + } } *localCount = count; return ncclSuccess; @@ -1379,7 +1387,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; - int* localNets; + int localNets[NCCL_TOPO_MAX_NODES]; float totalNetBw = 0, gpuBw = 0; for (int l=0; lnodes[GPU].nodes[gpu].nlinks; l++) { @@ -1391,54 +1399,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } } - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } *count = netCountByBw; - free(localNets); return ncclSuccess; } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { - ncclResult_t ret = ncclSuccess; int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); - int* localNets; + + int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); - int* localGpus = NULL; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); + if (localNetCount==0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - int net; - NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail); - net = system->nodes[GPU].nodes[gpu].gpu.dev; + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + + int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); net += channelId%(DIVUP(localNetCount,localGpuCount)); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; -exit: - free(localNets); - if (localGpus) free(localGpus); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); - int* localGpus = NULL; + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL)); + int foundGpu = -1; - NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); if (netId == id) { foundGpu = g; goto exit; @@ -1447,8 +1456,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i } exit: *gpuIndex = foundGpu; -fail: - free(localGpus); return ret; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 8e7cda5b4..2be029b88 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -16,6 +16,7 @@ #define SM80_NVLINK_BW 
20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 +#define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define AMD_BW 16.0 @@ -91,7 +92,8 @@ struct ncclTopoLink { float bw; struct ncclTopoNode* remNode; }; -#define NCCL_TOPO_MAX_LINKS 128 +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_LINKS 576 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { @@ -172,6 +174,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 @@ -230,6 +234,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return + cudaCompCap >= 100 ? SM100_NVLINK_BW : cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? SM80_NVLINK_BW : diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f5f2e1185..8da4aeb9e 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -145,28 +145,33 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = #define VOLTA_COMPCAP_IDX 0 #define AMPERE_COMPCAP_IDX 1 #define HOPPER_COMPCAP_IDX 2 +#define BLACKWELL_COMPCAP_IDX 3 // LL128 max BW per channel -static const double llMaxBws[3][3] = { +static const double llMaxBws[][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0} + /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, + /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, }; -static const double perChMaxRingLL128Bws[3][3] = { +static const double perChMaxRingLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, }; -static const double perChMaxTreeLL128Bws[3][3] = { +static const double perChMaxTreeLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, }; -static const double perChMaxTreeBws[3][3] = { +static const double perChMaxTreeBws[][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, + /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -207,7 +212,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; + int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? 
HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : @@ -418,7 +423,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); @@ -427,6 +432,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; + case 100: pEnable &= 1; break; + case 120: pEnable &= 1; break; default: pEnable &= 0; break; } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 7744119c3..021c91f77 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -204,14 +204,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { return result; } -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; - CUmemAllocationHandleType type = ncclCuMemHandleType; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -260,7 +259,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { extern int ncclCuMemEnable(); -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } @@ -288,7 +287,7 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -312,7 +311,7 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -336,7 +335,7 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } diff --git a/src/include/device.h b/src/include/device.h index 
0c861f595..3f918ab23 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -59,8 +59,8 @@ union ncclLLFifoLine { }; #define WARP_SIZE 32 -#define MAXCHANNELS 32 -#define NCCL_MAX_LOCAL_RANKS 64 +#define MAXCHANNELS 64 +#define NCCL_MAX_LOCAL_RANKS 72 #define NCCL_MAX_NTHREADS 640 #define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 @@ -187,8 +187,6 @@ struct ncclNvls { int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; }; #if __CUDA_ARCH__ >= 900 diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 3eb6c0743..5337eeba9 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -17,7 +17,7 @@ #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) #define NCCL_BYTES_ALIGNMENT 16 -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); diff --git a/src/include/graph.h b/src/include/graph.h index 602cc8cd9..a22b62bb2 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -28,7 +28,8 @@ void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink); + ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology diff --git a/src/include/mnnvl.h b/src/include/mnnvl.h new file mode 100644 index 000000000..dedbefe43 --- /dev/null +++ b/src/include/mnnvl.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MNNVL_H_ +#define NCCL_MNNVL_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); + +#endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 14b317fdd..5d00f0792 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -30,6 +30,7 @@ #define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 +#define NVTX_SID_CommFinalize 14 // Define static schema ID for the reduction operation. #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START @@ -38,11 +39,13 @@ extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +/// @brief Register an NVTX payload schema for static-size payloads. 
class payload_schema { public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, + const uint64_t schemaId, const size_t size) noexcept { - schema_attr.name = schemaName; + schema_attr.payloadStaticSize = size; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; @@ -63,26 +66,84 @@ class payload_schema { NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, + nullptr, /* schema name is not needed */ NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0, 0, nullptr}; }; +// Convenience macro to give the payload parameters a scope. +#define NVTX3_PAYLOAD(...) __VA_ARGS__ + // Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ +// @param N NCCL API name without the `nccl` prefix. +// @param T name of the used NVTX payload schema without "Schema" suffix. +// @param P payload parameters/entries +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{T##Schema, std::extent::value - 1, \ + schemaId, sizeof(T)}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; +/// @brief Creates an NVTX range with extended payload using the RAII pattern. +/// @tparam PayloadType Data type of the payload. +template +class ncclNvtxRange { + public: + explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { + nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + } + + ~ncclNvtxRange() noexcept { + if (payloadData.payload) { + nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); + } else { + nvtxDomainRangePop(nvtx3::domain::get()); + } + } + + void setPayloadData(const uint64_t schemaId) noexcept + { + payloadData = {schemaId, sizeof(PayloadType), &payload}; + } + + ncclNvtxRange() = delete; + ncclNvtxRange(ncclNvtxRange const&) = default; + ncclNvtxRange& operator=(ncclNvtxRange const&) = default; + ncclNvtxRange(ncclNvtxRange&&) = default; + ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + + // Holds the payload data. + PayloadType payload{}; + + private: + nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; +}; + +// Create an NVTX range with the function name as the range name. Use RAII pattern. +// @param T Type ID of the NVTX payload (pointer for variable-size payloads). 
+#define NVTX3_RANGE(T) \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; + +// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, +// which must be in this or an outer scope. +// @param N NCCL API name without the `nccl` prefix. +// @param S name of the used NVTX payload schema. +// @param P payload parameters/entries +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{S, std::extent::value - 1, schema_id, \ + sizeof(nvtx3_range__.payload)}; \ + nvtx3_range__.payload = {P}; \ + nvtx3_range__.setPayloadData(schema_id); \ +} while (0) + extern void initNvtxRegisteredEnums(); #endif diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h index 304d5d6a5..0f0c87d6a 100644 --- a/src/include/nvtx3/nvToolsExtPayloadHelper.h +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -11,7 +11,7 @@ /* This is just an empty marker (for readability), which can be omitted. */ /* TODO: Fix issue with trailing comma at end of entry list. */ -#define NVTX_PAYLOAD_ENTRIES +#define NCCL_NVTX_PAYLOAD_ENTRIES /** @@ -32,7 +32,7 @@ * * Example: * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (index, TYPE_INT, "integer value"), * (dpfloat, TYPE_DOUBLE, "fp64 value"), * (text, TYPE_CSTRING, "text", NULL, 24) @@ -80,7 +80,7 @@ * * Example: * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (int, index, TYPE_INT, "integer value"), * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h new file mode 100644 index 000000000..228a19275 --- /dev/null +++ b/src/include/nvtx_payload_schemas.h @@ -0,0 +1,125 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/// Definitions of NVTX payload types and schemas used for the NVTX +/// instrumentation in init.cc and collectives.cc. + +#ifndef NVTX_PAYLOAD_SCHEMAS_H_ +#define NVTX_PAYLOAD_SCHEMAS_H_ + + +#include "nccl.h" +#include "nvtx3/nvToolsExtPayload.h" +#include "nvtx3/nvToolsExtPayloadHelper.h" + +/** + * \brief Define a C struct together with the matching schema entries. + * + * Does the same as `NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but without creating the + * schema attributes. (Remove this helper when it is available in the NVTX headers.) + */ +#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) + +// C strings used as NVTX payload entry names. +static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID"; +static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device"; +static constexpr char const* nccl_nvtxRankStr = "Rank"; +static constexpr char const* nccl_nvtxNranksStr = "No. 
of ranks"; +static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]"; +static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation"; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr), + (int, ndev, TYPE_INT, "No. of devices") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr) + ) +) +// The typedef and payload schema for ncclCommInitRank is also used for, +// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort. +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, color, TYPE_INT, "Color"), + (int, key, TYPE_INT, "Key") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +// Used in NCCL APIs `ncclSend` and `ncclRecv`. 
+NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, peer, TYPE_INT, "Peer rank") + ) +) + +#endif // end include guard diff --git a/src/include/proxy.h b/src/include/proxy.h index b6ef0fa9d..c97a4d7ce 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -363,6 +363,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); + +// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, diff --git a/src/init.cc b/src/init.cc index 5caaaae09..3e218ab07 100644 --- a/src/init.cc +++ b/src/init.cc @@ -18,6 +18,7 @@ #include "argcheck.h" #include "tuner.h" #include "ras.h" +#include "mnnvl.h" #include #include #include @@ -27,6 +28,7 @@ #include #include #include "param.h" +#include "nvtx_payload_schemas.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -213,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); + free(comm->clique.ranks); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -530,6 +533,7 @@ static void showVersion() { } } +NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { @@ -564,12 +568,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + if (ncclParamMNNVLUUID() != -1) { + ((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID(); + ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + } + if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); } return ncclSuccess; @@ -638,71 +646,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -// Determine if MNNVL support is available -static int checkMNNVL(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - - // MNNVL requires cuMem to be enabled - if (!ncclCuMemEnable()) return 0; - - // MNNVL also requires FABRIC handle support - int cudaDev; - int flag = 0; - CUdevice currentDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported - (void) CUPFN(cuDeviceGetAttribute(&flag, 
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; - if (!flag) return 0; - // Check that all ranks have initialized the fabric fully - for (int i = 0; i < comm->nRanks; i++) { - if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; - } - - // Determine our MNNVL domain/clique - NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail); - comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; - for (int i = 0; i < comm->nRanks; i++) { - nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; - nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; - // Check if the cluster UUID and cliqueId match - // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail; - if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && - (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - if (i == comm->rank) { - comm->cliqueRank = comm->clique.size; - } - comm->clique.ranks[comm->clique.size++] = i; - } - } - // Determine whether to enable MNNVL or not - comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable(); - INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); - - if (comm->MNNVL) { - // Force the CUMEM handle type to be FABRIC for MNNVL - ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; - } - - return comm->MNNVL; - -fail: - if (comm->clique.ranks) free(comm->clique.ranks); - return 0; -} - -#else -static int checkMNNVL(struct ncclComm* comm) { - return 0; -} -#endif - #define TIMER_INIT_TOTAL 0 #define TIMER_INIT_KERNELS 1 #define TIMER_INIT_BOOTSTRAP 2 @@ -782,12 +725,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // AllGather1 - end timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; - // MNNVL support - if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { - // Return an error if the user specifically requested MNNVL support - WARN("MNNVL is not supported on this system"); - ret = ncclSystemError; - goto fail; + // Check for MNNVL support + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } do { @@ -1079,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->collNetSupport = 0; } } - comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); @@ -1406,18 +1346,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + int maxSharedMem = 0; double sum_timers = 0; uint64_t timers[TIMERS_INIT_COUNT] = {0}; unsigned long long commIdHash; timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); + CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 
10*archMinor; timers[TIMER_INIT_KERNELS] = clockNano(); - NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); + NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { @@ -1533,18 +1475,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { - WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); + INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.minCTAs = minCTAsEnv; + if (minCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs); + else + comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.maxCTAs = maxCTAsEnv; + if (maxCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs); + else + comm->config.maxCTAs = maxCTAsEnv; } envNetName = ncclGetEnv("NCCL_NET"); @@ -1565,22 +1513,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { - WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { - WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { - WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); + INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); + INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -1763,20 +1711,9 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId goto exit; } -struct NvtxParamsCommInitRank -{ - int rank; - int nranks; - int cudaDev; -}; -constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, -}; - NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + NVTX3_RANGE(NcclNvtxParamsCommInitRank) // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1784,10 +1721,11 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__)); + + NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + return ncclSuccess; } @@ -1799,10 +1737,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclConfig_t config = NCCL_CONFIG_INITIALIZER; int oldDev = 0; - constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} - }; - NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) + NVTX3_RANGE(NcclNvtxParamsCommInitAll); // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1840,14 +1775,17 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); - NCCLCHECKGOTO(ncclGroupStart(), ret, fail); + NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail); for (int i=0; icommHash, ndev)); exit: (void)cudaSetDevice(oldDev); @@ -1873,14 +1811,14 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; + + NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig); + NCCLCHECK(ncclGroupStartInternal()); (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1890,7 +1828,13 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1899,6 +1843,8 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config); ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, 
ncclConfig_t* config) { + NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable); + int cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; @@ -1908,9 +1854,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1920,7 +1863,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1980,7 +1929,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NVTX3_RANGE(NcclNvtxParamsCommFinalize); + ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; @@ -2005,7 +1955,13 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); } + if (comm) { + if (!comm->config.blocking) { + NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); + } + NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema, + NVTX3_PAYLOAD(comm->commHash)); + } return ret; fail: free(job); @@ -2077,8 +2033,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) + NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); NCCLCHECK(ncclGroupStartInternal()); @@ -2105,8 +2061,9 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { + NVTX3_RANGE(NcclNvtxParamsCommAbort); + if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } NCCLCHECK(ncclGroupStartInternal()); @@ -2127,8 +2084,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) + NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); @@ -2144,29 +2101,13 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { goto exit; } -struct NvtxParamsCommSplit { - int rank; - int nranks; - int cudaDev; - int color; - 
int key; -}; -constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)}, -}; - NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key}; - NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload) + NVTX3_RANGE(NcclNvtxParamsCommSplit) int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); @@ -2224,6 +2165,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); + + if (res == ncclSuccess && *newcomm) { + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + } + return res; fail: if (childComm) { diff --git a/src/mnnvl.cc b/src/mnnvl.cc new file mode 100644 index 000000000..07e8b21d9 --- /dev/null +++ b/src/mnnvl.cc @@ -0,0 +1,82 @@ +/************************************************************************* + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mnnvl.h" +#include "transport.h" +#include +#include "cudawrap.h" + +// Determine if MNNVL support is available +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { + // MNNVL requires cuMem to be enabled + if (!ncclCuMemEnable()) return ncclSuccess; + + // MNNVL also requires FABRIC handle support + int cudaDev; + int flag = 0; + CUdevice currentDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (!flag) return ncclSuccess; + // Check that all ranks have initialized the fabric fully + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess; + } + + // Determine our MNNVL domain/clique + NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks)); + comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; + for (int i = 0; i < comm->nRanks; i++) { + nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; + nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; + // Check if the cluster UUID and cliqueId match + // A zero UUID means we don't have MNNVL fabric info - disable MNNVL + if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && + (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { + if (i == comm->rank) { + comm->cliqueRank = comm->clique.size; + } + comm->clique.ranks[comm->clique.size++] = i; + } + } + + // No MNNVL clique found + if (comm->clique.size <= 1) return ncclSuccess; + + // Check that FABRIC handles can be exported & imported by IMEX + { + void *ptr = NULL; + CUmemGenericAllocationHandle handle; + ncclCuDesc cuDesc; + CUresult err; + + // Allocate FABRIC handle compatible memory + ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN); + if (ret != ncclSuccess) return ncclSuccess; + err = CUPFN(cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + if (err != CUDA_SUCCESS || + (err = CUPFN(cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC))) != CUDA_SUCCESS) { + const char *errStr; + (void) pfn_cuGetErrorString(err, &errStr); + NCCLCHECK(ncclCuMemFree(ptr)); + // Return an error if this is a MNNVL capable system but it's not working + WARN("MNNVL (cliqueSize %d) is available but not supported on this system. 
Check the IMEX configuration.", + comm->clique.size); + return ncclSystemError; + } + NCCLCHECK(ncclCuMemFree(ptr)); + + // Force the CUMEM handle type to be FABRIC for MNNVL + ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; + comm->MNNVL = 1; + INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d", + comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); + } + return ncclSuccess; +} diff --git a/src/proxy.cc b/src/proxy.cc index bd8188a37..5a83ef3eb 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -21,6 +21,8 @@ #include #include +#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1) + enum { proxyRecv=0, proxySend=1 }; void* ncclProxyServiceUDS(void* _args); @@ -770,8 +772,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend); TIME_START(2); - int freeOp[NCCL_MAX_LOCAL_RANKS]; - int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; + int freeOp[NCCL_MAX_PROXY_CONNECTIONS]; + int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS]; for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; @@ -1060,7 +1062,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in struct ncclProxyState* sharedProxyState = comm->proxyState; int tpProxyRank = comm->topParentRanks[proxyRank]; - proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) && + (comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ? 1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; @@ -1193,7 +1196,7 @@ ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclPr goto exit; } -const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; @@ -1552,18 +1555,18 @@ void* ncclProxyService(void* _args) { connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; - struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; - struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; - memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); - for (int s=0; slistenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { + if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; - pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN; int maxnpeers = 0; int npeers = 0; @@ -1577,17 +1580,19 @@ void* ncclProxyService(void* _args) { /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { - ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); + // poll all fds including the listenSock + ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 
0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } - if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) { + // We got an event on the listenSock int s = 0; - while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; - if (s == NCCL_MAX_LOCAL_RANKS) { - WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++; + if (s == NCCL_MAX_PROXY_CONNECTIONS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; @@ -1819,6 +1824,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { + // We need to send a ncclProxyMsgStop message to our own proxy struct ncclSocket sock; int type = ncclProxyMsgStop; NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 414a1ed94..3e4e9a504 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -80,7 +80,7 @@ static int rasOutBufferSize = 0; // We use them all over the place; no point in wasting the stack... static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers - // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // or for printing the local GPU devices, which can't be more than 64 // small numbers (times two if the NVML mask is different than the CUDA mask). // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). @@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) { bool first = true; buf[0] = '\0'; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(cudaDevs)*8; i++) if (cudaDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; @@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, if (cudaDevs != nvmlDevs) { snprintf(buf+strlen(buf), size-strlen(buf), " (NVML "); first = true; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(nvmlDevs)*8; i++) if (nvmlDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 68cac0b44..715fff4a4 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -78,7 +78,7 @@ struct rasCollResponse { struct rasPeerInfo { union ncclSocketAddress addr; pid_t pid; - uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. 
}; diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index 4282dc9c8..2ab7e9448 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged) { *regNeedConnect = 0; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ + /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */ if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + if (info->func == ncclFuncReduceScatter) { + // RS: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); + } + else { + // AR/AG: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4)); + } } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + // Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6)); } info->regBufType |= NCCL_NVLS_REG_BUFFER; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 582c30a35..3fe25a324 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) +#define NVLS_NCHANNELS_SM90 16 +#define NVLS_NCHANNELS_SM100 32 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); - if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess; + if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; CUdevice dev; int driverVersion; @@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); - if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); + if (comm->nvlsSupport) { + int channels = (comm->compCap >= 100) ? 
NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; + if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); + } return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 3ae514e45..dac762157 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -213,7 +213,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe // cuMem API support CUmemGenericAllocationHandle handle; - NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); + NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size)); if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); @@ -816,7 +816,7 @@ ncclResult_t ret = ncclSuccess; if (isLegacyIpc) *isLegacyIpc = false; if (regRecord) { // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; + int peerLocalRank = -1; for (int p = 0; p < nPeers; p++) { int peerRank = peerRanks[p]; peerLocalRank = comm->rankToLocalRank[peerRank]; @@ -886,8 +886,10 @@ ncclResult_t ret = ncclSuccess; ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. - if (proxyConn) + if (proxyConn) { + INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + } if (rmtRegAddr) { NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); assert(regRecord->ipcInfos[peerLocalRank] == NULL); @@ -905,7 +907,7 @@ ncclResult_t ret = ncclSuccess; regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; needUpdate = true; *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); } } } @@ -1039,6 +1041,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str assert(sizeof(struct p2pIpcExpInfo) == reqSize); assert(sizeof(void*) == respSize); + INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + // request peer passes all necessary buffer info to import. 
The proxy thread would register // the buffer locally and return register addr back if (ipcExpInfo->legacyIpcCap) { @@ -1070,7 +1074,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail); regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); } - INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); exit: memcpy(respBuff, (void*)®Addr, sizeof(void*)); From f44ac759fee12ecb3cc6891e9e739a000f66fd70 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Wed, 12 Mar 2025 13:46:21 -0700 Subject: [PATCH 06/21] NCCL 2.26.2-1 Profiler improvements * Add events for CUDA kernel start and end. * Allow network plugins to generate profiling events * Enable profiling on a per-operation basis, rather than per-communicator. * Add support for graph capturing. Add implicit launch order * Allow to prevent deadlocks when using multiple NCCL communicators per device by implicitly ordering NCCL operations using the host program order. Disabled by default, set NCCL_LAUNCH_ORDER_IMPLICIT=1 to enable. * Add a complementary mechanism to detect host threads racing to launch to the same device. Enabled by default, set NCCL_LAUNCH_RACE_FATAL=0 to disable. Optimize the PAT algorithm * Separate the computation and execution of PAT steps on different warps, allowing to run up to 16 PAT steps in parallel to significantly accelerate PAT and reduce its linear part. Add support for setting QoS per communicator * Add a new trafficClass field to the communicator configuration, to allow the application to select a particular traffic class for a given communicator. The meaning of the traffic class is network-specific and should be set in accordance with the network configuration. * For the IB/RoCE plugin, existing config variables such as NCCL_IB_SL and NCCL_IB_TC take precedence. Allow to enable GPU Direct RDMA specifically on C2C platforms * Disabled by default, set NCCL_NET_GDR_C2C=1 to enable. Do not disable user buffer registration unless PXN is really used * Only disable UB when a communicator has more than one rank per node on any node. RAS subsystem improvements * Report operation counts separately for each collective operation type. * Provide details about missing communicator ranks and reliably distinguish ranks that are no longer a given communicator's members (now reported as NOCOMM) from those that failed to respond. Add support for timestamps to NCCL diagnostic messages * On by default for WARN messages; NCCL_DEBUG_TIMESTAMP_LEVELS can be used to enable them for other debug levels as well. * The format can be changed using the NCCL_DEBUG_TIMESTAMP_FORMAT config variable. Reduce the memory usage with NVLink SHARP (NVLS) * Potentially save hundreds of MBs of device memory, considering the multicast buffer size granularity separately from the address alignment. Update performance tuning for recent Intel CPUs * Improve algorithm/protocol selection on recent CPUs such as Emerald Rapids and Sapphire Rapids. Improve channel scheduling when mixing LL and Simple operations. 
* Make LL operations account for 4x more traffic to ensure LL and simple operations complete at the same time.

Refactor the plugin code
* Clean up and harmonize the support code across the network, tuner, and profiler plugins.

Add support for comment lines (starting with #) in the nccl.conf file
* Issue #1540.

Make user buffer registration problems print an INFO instead of a WARN.

Drop support for network plugin interface version 5.

Fix a race condition with split-shared communicators
* NCCL could hang during connection setup if multiple communicators were grouped together that share resources.

Fix a performance regression when using NCCL_CROSS_NIC=1
* NCCL would unnecessarily alternate rings, breaking the GPU-NIC associations.

Make GID index detection code more resilient
* Dynamic GID detection code was giving up too soon if the detected index was not available (e.g., wasn't mapped to the container's sysfs).
* Issues #1538, #1573.

Fix a race condition with non-blocking operation
* Fix issue when creating a non-blocking communicator after a non-blocking collective operation on another communicator.

Fix shared memory usage on recent Blackwell GPUs.
* Issues NVIDIA/nccl-tests#287, NVIDIA/nccl-tests#291, #1637.

Fix an error with NIC fusion and IB SHARP when recreating communicators
* Disable the unloading of network plugins

Make the auto-merge failures in the NIC fusion non-fatal
* This could happen when trying to merge IB and RoCE devices.

Fixes to ncclCommAbort
* Fix hangs due to the progress thread spinning indefinitely on the network progress.
* Reduce the abort time by up to two orders of magnitude.

Fix a crash when libnccl.so was dynamically unloaded
* The RAS subsystem was missing a clean-up handler.

Fix a hang if the network plugin's test() call returns an error.

Fix a hang on heterogeneous architectures
* Ensure we harmonize the tuning to avoid different tuning choices, causing a hang.

Fix double-free on failed ncclCommInitRank and ncclCommFinalize.

Fix a potential list traversal bug during a group launch of multiple communicators
* Issue #1599.

Unify the handling of NCCL configuration variables
* Under rare circumstances, some variables specified in the config file could be ignored.
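As an illustrative sketch of the per-communicator QoS item above (not part of this patch): it assumes the new trafficClass field of ncclConfig_t described in these notes, uses a placeholder value of 4, and the helper name initCommWithQos is hypothetical. For the IB/RoCE plugin, NCCL_IB_SL and NCCL_IB_TC still take precedence.

```
#include <nccl.h>

/* Sketch: request a network-specific QoS level for one communicator.
 * The value 4 is a placeholder; its meaning depends on the network
 * plugin and fabric configuration. */
ncclResult_t initCommWithQos(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.trafficClass = 4;  /* placeholder QoS value */
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}
```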
--- ext-net/README.md | 30 +- ext-net/example/nccl/net.h | 13 +- ext-net/example/nccl/net_device.h | 3 +- ext-net/example/nccl/net_v10.h | 101 ++ ext-net/example/nccl/net_v2.h | 4 +- ext-net/example/nccl/net_v3.h | 4 +- ext-net/example/nccl/net_v4.h | 4 +- ext-net/example/nccl/net_v5.h | 4 +- ext-net/example/nccl/net_v6.h | 6 +- ext-net/example/nccl/net_v7.h | 6 +- ext-net/example/nccl/net_v8.h | 6 +- ext-net/example/nccl/net_v9.h | 12 +- ext-net/example/plugin.c | 77 +- makefiles/common.mk | 5 + makefiles/version.mk | 4 +- src/Makefile | 11 +- src/bootstrap.cc | 16 +- src/channel.cc | 37 +- src/debug.cc | 159 ++- src/device/all_gather.h | 64 +- src/device/all_reduce.h | 10 +- src/device/broadcast.h | 2 +- src/device/common.h | 37 +- src/device/primitives.h | 16 +- src/device/prims_ll.h | 17 +- src/device/prims_ll128.h | 17 +- src/device/prims_simple.h | 367 +++--- src/device/reduce_scatter.h | 61 +- src/device/sendrecv.h | 2 +- src/enqueue.cc | 236 ++-- src/graph/connect.cc | 2 +- src/graph/paths.cc | 82 +- src/graph/search.cc | 68 +- src/graph/topo.cc | 116 +- src/graph/topo.h | 38 +- src/graph/tuning.cc | 3 +- src/group.cc | 68 +- src/include/bitops.h | 53 +- src/include/collectives.h | 446 +++---- src/include/comm.h | 10 +- src/include/device.h | 8 +- src/include/graph.h | 16 +- src/include/group.h | 6 + src/include/nccl_net.h | 604 ---------- src/include/nccl_profiler.h | 235 ---- src/include/nccl_tuner.h | 149 --- src/include/net.h | 1 - src/include/net_device.h | 3 +- src/include/nvtx.h | 3 +- src/include/plugin/nccl_net.h | 54 + src/include/plugin/nccl_profiler.h | 69 ++ src/include/plugin/nccl_tuner.h | 22 + src/include/plugin/net/net_v10.h | 158 +++ src/include/plugin/net/net_v6.h | 113 ++ src/include/plugin/net/net_v7.h | 120 ++ src/include/plugin/net/net_v8.h | 134 +++ src/include/plugin/net/net_v9.h | 152 +++ src/include/plugin/plugin.h | 18 + src/include/plugin/profiler/net_ib.h | 13 + src/include/plugin/profiler/net_ib_v1.h | 34 + src/include/plugin/profiler/net_socket.h | 13 + src/include/plugin/profiler/net_socket_v1.h | 32 + src/include/plugin/profiler/profiler_v1.h | 107 ++ src/include/plugin/profiler/profiler_v2.h | 104 ++ src/include/plugin/profiler/profiler_v3.h | 112 ++ src/include/plugin/tuner/tuner_v2.h | 53 + src/include/plugin/tuner/tuner_v3.h | 55 + src/include/plugin/tuner/tuner_v4.h | 56 + src/include/profiler.h | 20 + src/include/proxy.h | 7 +- src/include/ras.h | 2 + src/include/register.h | 2 +- src/include/shm.h | 5 +- src/include/socket.h | 2 +- src/include/strongstream.h | 98 +- src/include/transport.h | 10 +- src/init.cc | 123 +- src/misc/ipcsocket.cc | 3 +- src/misc/param.cc | 1 + src/misc/socket.cc | 11 +- src/misc/strongstream.cc | 481 ++++---- src/misc/tuner.cc | 267 ----- src/nccl.h.in | 4 +- src/net.cc | 1033 ----------------- src/plugin/net.cc | 319 +++++ src/plugin/net/net_v10.cc | 32 + src/plugin/net/net_v6.cc | 178 +++ src/plugin/net/net_v7.cc | 174 +++ src/plugin/net/net_v8.cc | 196 ++++ src/plugin/net/net_v9.cc | 121 ++ src/plugin/plugin_open.cc | 134 +++ src/{misc => plugin}/profiler.cc | 426 +++---- src/plugin/profiler/profiler_v1.cc | 133 +++ src/plugin/profiler/profiler_v2.cc | 45 + src/plugin/profiler/profiler_v3.cc | 20 + src/plugin/tuner.cc | 99 ++ src/plugin/tuner/tuner_v2.cc | 66 ++ src/plugin/tuner/tuner_v3.cc | 38 + src/plugin/tuner/tuner_v4.cc | 22 + src/proxy.cc | 69 +- src/ras/client_support.cc | 851 ++++++++------ src/ras/collectives.cc | 716 ++++++++---- src/ras/peers.cc | 194 ++-- src/ras/ras.cc | 182 +-- 
src/ras/ras_internal.h | 139 ++- src/ras/rasnet.cc | 1156 +++++++++++-------- src/register/register.cc | 4 +- src/transport.cc | 18 +- src/transport/coll_net.cc | 104 +- src/transport/net.cc | 78 +- src/transport/net_ib.cc | 186 ++- src/transport/net_socket.cc | 73 +- src/transport/nvls.cc | 147 +-- src/transport/p2p.cc | 23 +- src/transport/profiler.cc | 55 + src/transport/shm.cc | 24 +- 116 files changed, 7498 insertions(+), 5254 deletions(-) create mode 100644 ext-net/example/nccl/net_v10.h delete mode 100644 src/include/nccl_net.h delete mode 100644 src/include/nccl_profiler.h delete mode 100644 src/include/nccl_tuner.h create mode 100644 src/include/plugin/nccl_net.h create mode 100644 src/include/plugin/nccl_profiler.h create mode 100644 src/include/plugin/nccl_tuner.h create mode 100644 src/include/plugin/net/net_v10.h create mode 100644 src/include/plugin/net/net_v6.h create mode 100644 src/include/plugin/net/net_v7.h create mode 100644 src/include/plugin/net/net_v8.h create mode 100644 src/include/plugin/net/net_v9.h create mode 100644 src/include/plugin/plugin.h create mode 100644 src/include/plugin/profiler/net_ib.h create mode 100644 src/include/plugin/profiler/net_ib_v1.h create mode 100644 src/include/plugin/profiler/net_socket.h create mode 100644 src/include/plugin/profiler/net_socket_v1.h create mode 100644 src/include/plugin/profiler/profiler_v1.h create mode 100644 src/include/plugin/profiler/profiler_v2.h create mode 100644 src/include/plugin/profiler/profiler_v3.h create mode 100644 src/include/plugin/tuner/tuner_v2.h create mode 100644 src/include/plugin/tuner/tuner_v3.h create mode 100644 src/include/plugin/tuner/tuner_v4.h delete mode 100644 src/misc/tuner.cc delete mode 100644 src/net.cc create mode 100644 src/plugin/net.cc create mode 100644 src/plugin/net/net_v10.cc create mode 100644 src/plugin/net/net_v6.cc create mode 100644 src/plugin/net/net_v7.cc create mode 100644 src/plugin/net/net_v8.cc create mode 100644 src/plugin/net/net_v9.cc create mode 100644 src/plugin/plugin_open.cc rename src/{misc => plugin}/profiler.cc (57%) create mode 100644 src/plugin/profiler/profiler_v1.cc create mode 100644 src/plugin/profiler/profiler_v2.cc create mode 100644 src/plugin/profiler/profiler_v3.cc create mode 100644 src/plugin/tuner.cc create mode 100644 src/plugin/tuner/tuner_v2.cc create mode 100644 src/plugin/tuner/tuner_v3.cc create mode 100644 src/plugin/tuner/tuner_v4.cc create mode 100644 src/transport/profiler.cc diff --git a/ext-net/README.md b/ext-net/README.md index aa1a3945e..90fe89bf5 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,20 +60,20 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v9) +# API (v10) -Below is the main `ncclNet_v9` struct. Each function is explained in later sections. +Below is the main `ncclNet_v10` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. 
- ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,13 +83,13 @@ typedef struct { // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -98,10 +98,10 @@ typedef struct { ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -200,6 +200,9 @@ the plugin code adding the following definitions: #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) ``` +The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and +record its own events with the NCCL profiler plugin. + `devices` Once the plugin is initialized, NCCL will query the number of devices available. It should not @@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. +The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field. +This field can be used by the network plugin to specify the QoS level of the connection. 
By default, +`trafficClass` is set to -1 but can be configured by the application during communicator initialization +to select a plugin-supported QoS level. + `closeListen`/`closeSend`/`closeRecv` Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call @@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. +The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin +to support network defined events. + `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument @@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to completions on such irecvs (for example, complete the request immediately). The plugin is still expected to set a valid request pointer on return which NCCL can poll to check for completion. +The `pHandle` argument allows NCCL to pass an array of opaque handles that can be used by the +network plugin to support network defined events. + Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 112967ab8..85ea79ef7 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -2,14 +2,15 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ +#ifndef NET_H_ +#define NET_H_ #include #include #include "common.h" #include "err.h" +#include "net_device.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -22,6 +23,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + +#include "net_v10.h" #include "net_v9.h" #include "net_v8.h" #include "net_v7.h" @@ -31,4 +35,9 @@ #include "net_v3.h" #include "net_v2.h" +typedef ncclNet_v10_t ncclNet_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index 874fb5999..d693101a3 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h new file mode 100644 index 000000000..809e7c001 --- /dev/null +++ b/ext-net/example/nccl/net_v10.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h index 0d9c90619..dd9f39b69 100644 --- a/ext-net/example/nccl/net_v2.h +++ b/ext-net/example/nccl/net_v2.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V2_H_ -#define NCCL_NET_V2_H_ +#ifndef NET_V2_H_ +#define NET_V2_H_ typedef struct { // Name of the network (mainly for logs) diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h index db1287b47..9002165e0 100644 --- a/ext-net/example/nccl/net_v3.h +++ b/ext-net/example/nccl/net_v3.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V3_H_ -#define NCCL_NET_V3_H_ +#ifndef NET_V3_H_ +#define NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h index efe482410..41cef56b4 100644 --- a/ext-net/example/nccl/net_v4.h +++ b/ext-net/example/nccl/net_v4.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V4_H_ -#define NCCL_NET_V4_H_ +#ifndef NET_V4_H_ +#define NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h index b96b6fc6b..47f446c75 100644 --- a/ext-net/example/nccl/net_v5.h +++ b/ext-net/example/nccl/net_v5.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V5_H_ -#define NCCL_NET_V5_H_ +#ifndef NET_V5_H_ +#define NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index fffaf8c62..de90f297c 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_V6_H_ -#define NCCL_NET_V6_H_ - -#define NCCL_NET_MAX_REQUESTS_V6 8 +#ifndef NET_V6_H_ +#define NET_V6_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h index d607095de..3802a3d78 100644 --- a/ext-net/example/nccl/net_v7.h +++ b/ext-net/example/nccl/net_v7.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V7_H_ -#define NCCL_NET_V7_H_ - -#include "net_device.h" +#ifndef NET_V7_H_ +#define NET_V7_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 54a61f61b..74eb72dd4 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V8_H_ -#define NCCL_NET_V8_H_ - -#include "net_device.h" +#ifndef NET_V8_H_ +#define NET_V8_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index 61035ecc9..ca60ad651 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -2,18 +2,14 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V9_H_ -#define NCCL_NET_V9_H_ - -#include "net_device.h" +#ifndef NET_V9_H_ +#define NET_V9_H_ #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 typedef struct { int ndevs; int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; } ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; typedef struct { char* name; // Used mostly for logging. @@ -35,8 +31,6 @@ typedef struct { size_t maxCollBytes; // Max transfer size for collective operations } ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; @@ -93,7 +87,7 @@ typedef struct { // Virtual NIC APIs. 
makeVDevice will create a virtual NIC given the specified properties, and tell the caller // what index this new vNIC exists at - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); } ncclNet_v9_t; #endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 285224261..97a29875d 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { #define PLUGIN_NAME "Plugin" -ncclNet_v9_t ncclNetPlugin_v9 = { +const ncclNet_v10_t ncclNetPlugin_v10 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = { .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { + return pluginInit(logFunction, NULL); +} + +__hidden ncclResult_t 
pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { + return pluginGetProperties(dev, (ncclNetProperties_t*)props); +} + +__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ + return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request); +} + +__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); +} + +__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } + +const ncclNet_v9_t ncclNetPlugin_v9 = { + .name = PLUGIN_NAME, + .init = pluginInit_v9, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v9, + .listen = pluginListen, + .connect = pluginConnect_v9, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v9, + .irecv = pluginIrecv_v9, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v9, +}; + __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); @@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr } __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); + return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); } __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; for (int i=0; i static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { diff --git a/makefiles/common.mk b/makefiles/common.mk index 1b1bb8674..545203a10 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -16,6 +16,7 @@ WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 +NET_PROFILER ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -137,3 +138,7 @@ endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif + +ifneq ($(NET_PROFILER), 0) +CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index b02cf909c..df3ee5c68 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 25 -NCCL_PATCH := 1 +NCCL_MINOR := 26 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b66ebefa2..65da6300b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,11 +10,15 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ $(wildcard register/*.cc) \ + 
$(wildcard plugin/*.cc) \ + $(wildcard plugin/net/*.cc) \ + $(wildcard plugin/tuner/*.cc) \ + $(wildcard plugin/profiler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl +INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest @@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 675bcfcd4..9e24faadf 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -153,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -167,8 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - size_t size64 = size; - NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -484,7 +484,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { - char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); bool searchNot = userIfEnv && userIfEnv[0] == '^'; @@ -540,7 +540,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -736,6 +736,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { rasRanks[rank].pid = getpid(); rasRanks[rank].cudaDev = comm->cudaDev; rasRanks[rank].nvmlDev = comm->nvmlDev; + rasRanks[rank].hostHash = getHostHash(); + rasRanks[rank].pidHash = getPidHash(); if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); // We should still participate in the ringAllInfo below as the peers will be waiting for us. 
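The bootstrap hunks above show NCCL core simply passing NULL for the new v10 `config` and `phandle` arguments. On the plugin side, the README changes earlier in this patch describe how those arguments are meant to be consumed: `config->trafficClass` selects a QoS level for the connection, and `phandle` lets the plugin record its own events through the `ncclProfilerCallback_t` passed to `init`. Below is a minimal sketch of that flow, not part of the patch: the helper-free bodies, the include path, and the event type/pluginId values are assumptions; only the signatures and the `NCCL_NET_TRAFFIC_CLASS_UNDEF` constant come from the `net_v10.h` definitions shown above.

```c
/* Illustrative sketch only - assumes the headers from ext-net/example/nccl/
 * are on the include path; helper logic is omitted. */
#include <stddef.h>
#include "net.h"   /* pulls in net_v10.h, net_device.h, err.h (path is an assumption) */

static ncclProfilerCallback_t profCb;   /* saved from init(); may stay NULL */

ncclResult_t sketchInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  profCb = profFunction;                /* NULL when net profiling is disabled */
  (void)logFunction;
  return ncclSuccess;
}

ncclResult_t sketchConnect(int dev, ncclNetCommConfig_v10_t* config, void* handle,
                           void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) {
  /* -1 (NCCL_NET_TRAFFIC_CLASS_UNDEF) means "no QoS requested"; 0 as the
   * fallback is an assumption for this sketch. */
  int tc = (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF)
           ? config->trafficClass : 0;
  /* ... establish the connection using traffic class 'tc' ... */
  (void)tc; (void)dev; (void)handle; (void)sendComm; (void)sendDevComm;
  return ncclSuccess;
}

ncclResult_t sketchIsend(void* sendComm, void* data, size_t size, int tag,
                         void* mhandle, void* phandle, void** request) {
  void* eHandle = NULL;
  /* Record a plugin-defined event against the handle NCCL passed in.
   * The event type (0) and pluginId (0) are placeholders, not real values. */
  if (profCb && phandle) profCb(&eHandle, /*type=*/0, phandle, /*pluginId=*/0, /*extData=*/NULL);
  /* ... post the send and return a request ... */
  (void)sendComm; (void)data; (void)size; (void)tag; (void)mhandle; (void)request;
  return ncclSuccess;
}
```

In this sketch, leaving `trafficClass` at `NCCL_NET_TRAFFIC_CLASS_UNDEF` falls back to a plugin default, matching the README text; a plugin that never calls the profiler callback can safely ignore both `phandle` and `profFunction`, as the example plugin's v9 compatibility wrappers in this patch do by forwarding NULL.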
@@ -967,7 +969,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); - NCCLCHECK(ncclSocketClose(&sock)); + NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail); return ret; fail: (void)ncclSocketClose(&sock); @@ -1062,7 +1064,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ - int data[1]; + int data[1] = {0}; for (int mask = 1; mask < nranks; mask <<= 1) { int src = (rank - mask + nranks) % nranks; int dst = (rank + mask) % nranks; diff --git a/src/channel.cc b/src/channel.cc index b3a8f29b5..bc48986d8 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; - - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + cudaStream_t deviceStream; + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e. network) @@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { - NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); - return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct 
ncclSharedResources* sharedRes = comm->sharedRes; + cudaStream_t deviceStream; if (channel->nvlsPeers != NULL) return ncclSuccess; @@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); int nvlsRanks = comm->localRanks; @@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); - NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream)); for (int r = 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } @@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; + cudaStream_t deviceStream; if (channel->collnetPeers != NULL) return ncclSuccess; @@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + 
comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); - NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } diff --git a/src/debug.cc b/src/debug.cc index 2ea6eabde..2eb8d7749 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -6,6 +6,7 @@ #include "core.h" #include "nccl_net.h" +#include #include #include #include @@ -16,6 +17,11 @@ #include "param.h" int ncclDebugLevel = -1; +static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on +static char ncclDebugTimestampFormat[256]; // with space for subseconds +static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts +static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio +static int ncclDebugTimestampSubsecondDigits; // Number of digits to display static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; @@ -112,6 +118,84 @@ static void ncclDebugInit() { ncclWarnSetDebugInfo = value; } + // Determine which debug levels will have timestamps. + const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + if (timestamps == nullptr) { + ncclDebugTimestampLevels = (1< sizeof(ncclDebugTimestampFormat) - 1) { + // Won't fit; fall back on the default. + break; + } + ncclDebugTimestampSubsecondsStart = i; + ncclDebugTimestampMaxSubseconds = 1; + + memcpy(ncclDebugTimestampFormat, tsFormat, i); + for (int j=0; j>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", - hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); } + len = std::min(len, sizeof(buffer)-1); // prevent overflows + // Add the message as given by the call site. va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len >= sizeof(buffer)) len = sizeof(buffer)-1; - if (len) { - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + // Rewind len so that we can replace the final \0 by "\n" + len = std::min(len, sizeof(buffer)-1); // prevent overflows + + // Add a newline and write it to the debug file. 
No terminating null is + // necessary since we write bytes instead of the string. + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } NCCL_API(void, ncclResetDebugInit); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 5d79d7357..854ebbf3a 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -67,7 +67,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else if (inputBuf != outputBuf + ringRanks[0] * count) { inputBuf = inputBuf + partOffset; @@ -111,25 +111,63 @@ struct RunWorkColl struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); - - PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; ipatSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); + + if (tid == nworkers) { // Algo computation thread + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, 
nullptr, 0, primsModePatAg); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patCopy(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step += nGroups; + } } +#endif } }; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 216159747..81da55401 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -78,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } @@ -132,7 +132,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -215,7 +215,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -710,7 +710,7 @@ struct RunWorkCollchannels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } diff --git a/src/device/primitives.h b/src/device/primitives.h index 73c10c264..3b9f169f7 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -12,7 +12,7 @@ #include "common_kernel.h" #include "common.h" -#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 +#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primtiives class instead of integral @@ -115,7 +115,7 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { @@ -139,6 +139,18 @@ struct PrimitivesWithoutDirect { } }; +__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { + if (abortCache & abortValue) return 1; + if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; + spins = 0; + int abort = *ncclShmem.comm.abortFlag; + if (abort) { + ncclShmem.aborted = abort; + abortCache |= abortValue; + } + return abort; +} + #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 3e00f3b85..2a0f5564b 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -51,23 +51,14 @@ class Primitives: } } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, 1)) 
break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes; @@ -102,7 +93,7 @@ class Primitives: int spins = 0; do { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; @@ -126,7 +117,7 @@ class Primitives: int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 617b7acf3..6985e6771 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -53,23 +53,14 @@ class Primitives: barrier_sync(15-group, nthreads); } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int i, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, wid, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; @@ -201,7 +192,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, 0, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll @@ -248,7 +239,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, i, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 005101940..cf3ba9b55 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -52,7 +52,7 @@ class Primitives< uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; - uint64_t accSize; // Accumulated size. 
Used by PAT operations + uint64_t accSize; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -70,6 +70,11 @@ class Primitives< } } + // PAT uses a single barrier across all groups + __device__ void patBarrier() { + barrier_sync(15, NCCL_PAT_NWORKERS); + } + __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); @@ -87,18 +92,6 @@ class Primitives< } } - inline __device__ bool checkAbort(int &spins) { - spins++; - if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - flags |= Aborted; - ncclShmem.aborted = 1; - } - spins = 0; - } - return flags & Aborted; - } - inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { @@ -121,7 +114,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } @@ -338,13 +331,8 @@ class Primitives< peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { - if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - ncclShmem.aborted = 1; - break; - } - spins = 0; - } + int abort = 0; + if (checkAbort(abort, 1, spins)) break; } } @@ -359,7 +347,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; @@ -601,13 +589,13 @@ class Primitives< tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { - // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int peer = -1; flags = 0; index = -1; if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + // For send operations, we need an extra warp to overlap the threadfence and the copy + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); + int nrecv=0, nsend=0; // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] @@ -637,68 +625,84 @@ class Primitives< if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? 
p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); + + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers + // have NetDeviceUnpack. + uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n flags |= PatMode; - accSize = 0; + const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput }; + if (tid < 5) flags |= roles[tid]; + int nranks = ncclShmem.comm.nRanks; - int rank = ncclShmem.comm.rank; - // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer. - index = tid % 32; - uint32_t delta = 1 << index; - const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; - int block = tid / 32; - if (block < 4 && delta < nranks) { - int role = roles[block]; - if (mode == primsModePatRs) { - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; - } else if (mode == primsModePatAg) { - if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; - } - flags |= role; - } else if (tid == 128) { - flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + if (tid < 32 && ((1UL<conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv; + peer->step = conn->step; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->tailPtr = conn->tail); + peer->headPtr = conn->head; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); + // Load send peer + int sendPeer = mode == primsModePatAg ? 
(rank - delta + nranks) % nranks : (rank + delta) % nranks; + peer = ((struct ncclPatPeer*)sendPeers)+tid; + conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend; + peer->step = conn->step; + peer->connFifo = conn->connFifo; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->headPtr = conn->head); + peer->tailPtr = conn->tail; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); } - } - - // Coverity thinks that index could be -1 here but that's not actually the case. - // coverity[negative_returns:FALSE] - int sendIpcReg; - int recvIpcReg; - int sendNetReg; - int recvNetReg; - if (P2p) { - sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; - recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; - sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; - recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; - } else { - recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; - recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; - } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); - - if (barrierAny(flags & NetDeviceUnpack)) { - flags |= AnyNetDeviceUnpack; - // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers - // have NetDeviceUnpack. - uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); - if (tid == 0) { - ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + if (tid==0) { + ncclShmem.groups[group].userInput = (void*)inputBuf; + ncclShmem.groups[group].userOutput = (void*)outputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } + patBarrier(); } - - // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case - // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); - // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { + if (flags&PatMode) return; // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) conn->step = step; if ((flags & NetRegMode) && (flags & RoleWaitSend)) { @@ -708,7 +712,7 @@ class Primitives< uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; - while (*ptr != -1) if (checkAbort(spins)) break; + while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break; } if (flags & NetDeviceUnpack) { @@ -726,7 +730,7 @@ class Primitives< int spins = 0; volatile uint64_t* tail = conn->tail; volatile uint64_t* head = conn->head; - while (*tail > *head) if (checkAbort(spins)) break; + while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break; } } @@ -749,7 +753,7 @@ class Primitives< if (slot) { T* exchgPtr; directBuff = (T*)outputBuf; - while (*slot != nullptr && !checkAbort(spins)); + while (*slot != nullptr && !checkAbort(flags, Aborted, spins)); if (P2p) { exchgPtr = (T*)outputBuf; } else { @@ -766,7 +770,7 @@ 
class Primitives< void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot) { @@ -785,7 +789,7 @@ class Primitives< // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { T* exchgPtr; - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; @@ -815,7 +819,7 @@ class Primitives< void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot && argSlot0 && argSlot1) { @@ -826,7 +830,7 @@ class Primitives< while (true) { arg0 = *argSlot0; arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } @@ -866,8 +870,8 @@ class Primitives< __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); + __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -945,54 +949,65 @@ class Primitives< ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } - __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; + } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; + } + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; int spins = 0; - while (connStepCache < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; } - if (postRecv) step += StepPerSlice; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + if (send && (flags & RoleWaitSend)) { int spins = 0; - while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; - if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; - if (flags & ConnFifoEnabled) - connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; } else { // There is already data in there, accumulate instead of writing to it. ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } - if (postSend) step += StepPerSlice; } - if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer - ncclShmem.groups[group].dsts[0] = userOutput + outIx; - if (accSize < outIx + nelem) { + long long int localAccSize = shmem->localAccSize; + if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx; + if (localAccSize < ps->outIx + nelem) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = outIx + nelem; + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; + localAccSize = ps->outIx + nelem; } else { // There is already data in there, accumulate instead of writing to it. 
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } } - barrier(); + patBarrier(); int nSrcs = 2; void** srcs = ncclShmem.groups[group].srcs; - if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1000,59 +1015,92 @@ class Primitives< (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } - __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; + } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; + } + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; int spins = 0; - while (connStepCache < step + recvStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; } - if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { + if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, copy to our output buffer. - ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; } else { ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done } - if (postRecv) step += StepPerSlice; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + if (send && (flags & RoleWaitSend)) { int spins = 0; - while (connStepCache + NCCL_STEPS < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; - if (postSend) { - if (flags & ConnFifoEnabled) - connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - step += StepPerSlice; + while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; } - if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer - ncclShmem.groups[group].srcs[0] = userInput + inpIx; - if (accSize < inpIx + nelem) { + long long int localAccSize = shmem->localAccSize; + if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx; + if (localAccSize < ps->inpIx + nelem) { // New data, copy to our output buffer. 
- ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = inpIx + nelem; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; + localAccSize = ps->inpIx + nelem; } else { - ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done + // Already done + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; } } - barrier(); + patBarrier(); int nDsts = 2; void** dsts = ncclShmem.groups[group].dsts; - if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1061,9 +1109,32 @@ class Primitives< (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } }; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 70538b117..5d8de2819 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -80,29 +80,66 @@ struct RunWorkColl struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset +
channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatRSAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patReduce(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step += nGroups; + } } +#endif } }; - template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index fe3b9ca77..f36a511d8 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -41,7 +41,7 @@ struct RunWorkBatch (maxSharedMem-attr.sharedSizeBytes)) { - if (print++ == 0) - INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", - sharedMemSize, maxSharedMem-attr.sharedSizeBytes); - // Reduce requested MaxDynamicSharedMemorySize attribute - sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + return ncclSystemError; } CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), @@ -388,6 +385,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool struct ncclTaskColl* next = aggBeg->next; aggBeg->algorithm = agg.algorithm; aggBeg->protocol = agg.protocol; + if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4; aggBeg->nMaxChannels = agg.nMaxChannels; aggBeg->nWarps = agg.nWarps; aggBeg->devFuncId = agg.devFuncId; @@ -478,6 +476,14 @@ ncclResult_t 
ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool return ncclSuccess; } +static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { + int tmp = op->pattern; + op->pattern = ncclPatternProfiler; + ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op); + op->pattern = tmp; + return ret; +} + static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { @@ -550,11 +556,16 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp.opCount = proxyOpId; proxyOp.task.coll = task; proxyOp.rank = comm->rank; + proxyOp.eActivationMask = task->eActivationMask; + proxyOp.workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4; size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); @@ -669,11 +680,14 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->ringAlgo->incRefCount(); } + proxyOp->eActivationMask = task->eActivationMask; + proxyOp->workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -797,7 +811,8 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); for (int part = 0; part < nChannelsMax; part++) { @@ -888,6 +903,7 @@ static ncclResult_t addP2pToPlan( op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; + op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -898,7 +914,6 @@ static ncclResult_t addP2pToPlan( plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; @@ -935,9 +950,12 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. 
proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } + comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1157,22 +1175,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; + cudaStream_t deviceStream; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); - // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the - // user's graph will be launched later, and it also acquires the deviceStream, - // it will observe this upload. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); + // Acquire deviceStream. Since the user's graph will be launched later and it also + // acquires the deviceStream, it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail); cudaEvent_t memcpyDone; CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail); NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; @@ -1180,7 +1199,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla cleanup->hostBuf = fifoBufHost; ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: @@ -1254,14 +1273,15 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { - 
comm->persistentRefs -= 1; + comm->sharedRes->persistentRefs -= 1; + comm->localPersistentRefs -= 1; if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -1317,6 +1337,28 @@ static void persistentDestructor(void* plans_) { } } +NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0); + +namespace { + enum ncclImplicitOrder { + ncclImplicitOrderNone, + ncclImplicitOrderSerial, + ncclImplicitOrderLaunch + }; +} + +static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { + if (ncclParamLaunchOrderImplicit()) { + // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs + if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } + if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + *mode = 12030 <= std::min(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; + return ncclSuccess; + } + *mode = ncclImplicitOrderNone; + return ncclSuccess; +} + ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1364,58 +1406,60 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (nPlans == 0) return ncclSuccess; - // Semantically we want these dependencies for the kernels launched: - // 1. Launch host task on hostStream. - // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} - // 3. {deviceStream, userStream[i]...} depend on kernel. - // We achieve this by: - // 1. userStream[0] waits on deviceStream - // 2. deviceStream waits on each of userStream[1...] - // 3. host task launch on hostStream - // 4. userStream[0] waits on hostStream - // 5. kernel launch on userStream[0] - // 6. deviceStream waits on userStream[0] - // 7. userStream[1...] each waits on deviceStream - // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires - // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t deviceStream, launchOrder; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure); - // Create dependency for device stream on user streams. First from extra user - // streams to deviceStream. Then deviceStream to first user stream. + // userStream[0] waits on each userStream[i]... for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure); + CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure); + } + // userStream[0] waits on deviceStream + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure); + + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); + + if (implicitOrder != ncclImplicitOrderNone) { + // userStream[0] waits on per-device (context) launchOrder. 
Concurrent strong stream access is + // required if this is a graph capture, non-captured cannot be concurrent because that would violate + // deterministic program order of launches. + bool concurrent = capturing; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { + if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; + cudaStream_t hostStream; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. 
- NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure); } } if (persistent) { - comm->persistentRefs += nPlans; + comm->sharedRes->persistentRefs += nPlans; + comm->localPersistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - failure: return result; } @@ -1434,6 +1478,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; @@ -1447,18 +1492,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan CU_LAUNCH_PARAM_END }; + int driverVersion; + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return); + CUfunction fn; - CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); + CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return); + if (CUDART_VERSION >= 11080 && driverVersion >= 11080) { #if CUDART_VERSION >= 11080 - int driverVersion; - NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); - if (driverVersion >= 11080) { int compCap = comm->compCap; unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[3]; + CUlaunchAttribute launchAttrs[4] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1485,6 +1531,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif + #if CUDART_VERSION >= 12030 + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + if (implicitOrder == ncclImplicitOrderLaunch) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; + launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; + launchAttrs[attrs].value.launchCompletionEvent.flags = 0; + attrs++; + } + #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; @@ -1496,15 +1553,15 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); - CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); - return ncclSuccess; - } + CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif - // Standard kernel launch - CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); - //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); - return ncclSuccess; + } else { + // Standard kernel launch + 
CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return); + } + +do_return: + return ret; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { @@ -1524,34 +1581,39 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { - ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch - // Create dependency for deviceStream on launchStream. We know that deviceStream - // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), - // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); - resume1: - // Create dependency for other user streams (skip launch stream) on deviceStream. - // Again, the user streams haven't been touched since deviceStream waited on them - // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = planner->streams->next; - planner->streams = nullptr; // Reset comm->planner.streams to empty. - while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); - resume2: - sl = sl->next; + cudaStream_t deviceStream, launchOrder; + CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + // deviceStream waits on userStream[0] + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); + CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // Each userStream[i] waits on userStream[0] + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); } - // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); - resume3:; + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECK(getImplicitOrder(&implicitOrder, capturing)); + if (implicitOrder != ncclImplicitOrderNone) { + // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured. + bool concurrent = capturing; + // Incorporate launch event into per-device (context) launch order. + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); + // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? 
comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + // Release launchOrder as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); + } + // Release deviceStream as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false)); } - return result; + return ncclSuccess; } /*****************************************************************************/ @@ -1655,11 +1717,11 @@ static ncclResult_t topoGetAlgoInfo( if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { char ncclAlgoEnvStr[1024] = ""; char ncclProtoEnvStr[1024] = ""; - char* algoEnv = getenv("NCCL_ALGO"); + const char* algoEnv = ncclGetEnv("NCCL_ALGO"); if (algoEnv) { snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - char* protoEnv = getenv("NCCL_PROTO"); + const char* protoEnv = ncclGetEnv("NCCL_PROTO"); if (protoEnv) { snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); } @@ -2007,7 +2069,7 @@ static ncclResult_t hostToDevRedOp( uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); bool datatype_signed = false; - + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2097,6 +2159,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { p2p->datatype = info->datatype; p2p->root = info->root; p2p->bytes = nBytes; + p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); ncclIntruQueueEnqueue( isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2105,6 +2168,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank @@ -2115,12 +2179,17 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. 
+ comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; comm->connectRecv[peer] |= (1UL<opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 64fc1c5dd..76b508c2d 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -390,7 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 587a8b282..ace4476f6 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -376,9 +376,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; +const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { - *useGdr = 0; +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { + *gdrMode = ncclTopoGdrModeDisable; // Get GPU and NET int n, g; @@ -418,25 +421,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead - int proxyRank, g; + int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); - struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; - distance = proxyGpu->paths[NET][n].type; + gpu = system->nodes[GPU].nodes+g; + distance = gpu->paths[NET][n].type; } + + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + distance = PATH_C2C; + } + if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } - *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + // Force PCIe mapping if path goes through PCI on a C2C system + if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; + else *gdrMode = ncclTopoGdrModeDefault; + + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d 
mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { int netNum = system->nodes[NET].count; - int useGdr = 0; + enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable; *avail = false; for (int n = 0; n < netNum; n++) { int64_t netId = system->nodes[NET].nodes[n].id; @@ -469,6 +484,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; + // On C2C platforms, data could go through a PCI switch while completions and + // flags would go through C2C. In that case, force a flush. + int c, n; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { + *flush = 1; + } return ncclSuccess; } @@ -538,7 +561,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && ncclNetVersion(comm) == 4) { + if (comm && comm->ncclNetVer == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -561,9 +584,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; - int useGdr; + enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); - if (useGdr == 0) continue; + if (useGdr == ncclTopoGdrModeDisable) continue; int found = 0; for (int r=0; rpaths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. - int gdr; + enum ncclTopoGdrMode gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU @@ -862,3 +885,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink *allNvLink = maxPath >= PATH_PIX ? 0 : 1; return ncclSuccess; } + +// Check whether we are in a split NVLink situation, with two NVLink domains, not +// connected through NVLink (e.g. QPI). +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) { + ncclResult_t res = ncclSuccess; + int nvlDomains = 0; + int *nvlDomain = NULL, *nvlDomainCount = NULL; + // Compute NVLink domains + NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) nvlDomain[g] = g; + for (int g=0; gnodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int domain = nvlDomain[g]; + for (int p=g+1; pnodes[GPU].count; p++) { + if (gpu->paths[GPU][p].type == PATH_NVL) { + nvlDomain[p] = domain; + } + } + } + // Compute number of GPUs per NVLink domain. + NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) { + nvlDomainCount[nvlDomain[g]]++; + } + // Count the number of NVLink domains + for (int g=0; gnodes[GPU].count; g++) { + if (nvlDomainCount[g] > 1) nvlDomains++; + } + *splitNvLink = nvlDomains == 2 ? 
1 : 0; + +exit: + if(nvlDomain) free(nvlDomain); + if(nvlDomainCount) free(nvlDomainCount); + return res; +} diff --git a/src/graph/search.cc b/src/graph/search.cc index 0185b3f7b..15a01243f 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -446,12 +446,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. -ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; int localNetCount; - int* localNets; - NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS)); + int localNets[MAXCHANNELS]; // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { @@ -460,8 +459,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL), ret, fail); - NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; } @@ -469,7 +468,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; iintra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; - int* nets = NULL; + int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { @@ -533,8 +527,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netCount; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; @@ -555,14 +548,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } @@ -601,21 +594,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } 
-exit: - if (nets) free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -639,7 +626,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { int gpu; - NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -650,7 +637,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } if (!duplicate) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } @@ -659,14 +646,14 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } @@ -686,7 +673,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -700,11 +687,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } -exit: - free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } /* Search Patterns @@ -999,6 +982,15 @@ ncclResult_t 
ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->minChannels = graph->maxChannels; } + int splitNvLink; + NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink)); + if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) { + // We have two sockets with NVLink and a slower link in between (typically QPI). + // Tree is likely going to work better but it needs at least 2 channels. + // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels. + if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2; + } + struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ba82cafb7..9499f396d 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -22,8 +22,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -45,7 +45,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) return ncclSuccess; } -static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) { *cpu = NULL; if (node->type == CPU) { *cpu = node; @@ -54,9 +54,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* for (int l=0; lnlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI + && node->links[l].remNode != from && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { - NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node)); } if (*cpu != NULL) return ncclSuccess; } @@ -77,13 +78,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; + *bw = + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW : + BDW_QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + cpu->cpu.model = + (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP : + (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP : + (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL : + NCCL_TOPO_CPU_MODEL_INTEL_BDW; } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; + if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG; } } for (int s=0; snSubs; s++) { @@ -565,7 +574,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId))); } else if (targetType == CPU) { // NVL connection to the local CPU - NCCLCHECK(findLocalCpu(gpu, &remote)); + NCCLCHECK(findLocalCpu(gpu, &remote, NULL)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); @@ -642,10 +651,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys NCCLCHECK(xmlGetAttrInt(node, "bw", &bw)); double c2cBw = (bw*count)/1000.0; struct ncclTopoNode* cpu = NULL; - NCCLCHECK(findLocalCpu(gpu, &cpu)); + NCCLCHECK(findLocalCpu(gpu, &cpu, NULL)); if (cpu == NULL) return ncclSuccess; - NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); - NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw)); } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); @@ -961,26 +970,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (* // Trigger the merge, then get the new device's properties int vDevIndex = 0; ncclResult_t ret = makeVDevice(&vDevIndex, vProps); - if (ret == ncclInvalidUsage) { - WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC"); - NCCLCHECK(ret); + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. 
Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", + vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); + return ret; } INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + ncclResult_t ret = ncclSuccess; INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + char* ncStr; + NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1)); + strcpy(ncStr, str); char* semi_token; - char* semi = strtok_r(str, ";", &semi_token); + char* semi = strtok_r(ncStr, ";", &semi_token); while (semi) { TRACE(NCCL_NET, "Fusing %s", semi); struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); if (nUserIfs == 0) { INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.", - str, semi); + ncStr, semi); continue; } @@ -994,26 +1008,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, if (vProps.ndevs != nUserIfs) { WARN("TOPO/NET : Only matched %d devices, %d requested from %s", vProps.ndevs, nUserIfs, semi); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } struct ncclXmlNode* netNode; - NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice)); - - // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) - for (int i = 0; i < vProps.ndevs; i++) { - placedDevs[vProps.devs[i]] = 1; + ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + if (ret == ncclSuccess) { + // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) + for (int i = 0; i < vProps.ndevs; i++) { + placedDevs[vProps.devs[i]] = 1; + } + } else { + WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi); + ret = ncclInvalidUsage; + goto fail; } semi = strtok_r(NULL, ";", &semi_token);; } - return ncclSuccess; +exit: + free(ncStr); + return ret; +fail: + goto exit; } ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { @@ -1061,7 +1086,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe } struct ncclXmlNode* netNode; - NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out); + ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + + // Merging failed. 
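Editorial note on the force-merge path above: NCCL_NET_FORCE_MERGE takes a semicolon-delimited list of comma-delimited NIC groups, and each group is handed to parseStringList() before being turned into one fused device. Below is a minimal standalone sketch of that string format only; the device names are hypothetical and the loop is illustrative, not the actual parseStringList() used by the patch.

    // Hypothetical example: NCCL_NET_FORCE_MERGE="mlx5_0,mlx5_1;mlx5_2,mlx5_3"
    // asks for two fused NICs of two physical devices each.
    #include <cstdio>
    #include <cstring>

    int main() {
      char spec[] = "mlx5_0,mlx5_1;mlx5_2,mlx5_3";
      char* semiSave = nullptr;
      for (char* group = strtok_r(spec, ";", &semiSave); group != nullptr;
           group = strtok_r(nullptr, ";", &semiSave)) {
        std::printf("fused NIC:");
        char* commaSave = nullptr;
        for (char* dev = strtok_r(group, ",", &commaSave); dev != nullptr;
             dev = strtok_r(nullptr, ",", &commaSave))
          std::printf(" %s", dev);
        std::printf("\n");
      }
      return 0;
    }

As the surrounding checks enforce, each group must resolve to at most NCCL_NET_MAX_DEVS_PER_NIC physical devices, otherwise the whole string is rejected as invalid usage.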
+ // Mark all as unplaced and increase their distance to disconnected (PATH_DIS) + // Set i to 0 to restart the automatic merging process and ensure all are placed + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search."); + placedDevs[i] = 0; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i); + for (int k = 1; k < vProps.ndevs; k++) { + int dev = vProps.devs[k]; + placedDevs[dev] = 0; + paths[i*nPhysDevs + dev] = PATH_DIS; + paths[dev*nPhysDevs + i] = PATH_DIS; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i); + } + i = 0; + } } } @@ -1125,16 +1167,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ // By default, don't merge any devices int mergeLevel; mergeLevel = PATH_PORT; - char* mergeLevelEnv; - mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge; - forceMerge = getenv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); - - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + { // Avoids warnings related to jumping to "out" + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); + + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } } NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); diff --git a/src/graph/topo.h b/src/graph/topo.h index 2be029b88..921a7f5d6 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,9 +18,11 @@ #define SM86_NVLINK_BW 12.0 #define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 -#define QPI_BW 6.0 #define AMD_BW 16.0 +#define BDW_QPI_BW 6.0 #define SKL_QPI_BW 10.0 +#define SRP_QPI_BW 22.0 +#define ERP_QPI_BW 40.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -44,12 +46,13 @@ extern const char* topoNodeTypeStr[]; #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB -#define LINK_PCI 3 -// Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PXN -// Skipping 6 for PATH_PHB -#define LINK_SYS 7 -#define LINK_NET 8 +#define LINK_C2C 3 +#define LINK_PCI 4 +// Skipping 5 for PATH_PXB +// Skipping 6 for PATH_PXN +// Skipping 7 for PATH_PHB +#define LINK_SYS 8 +#define LINK_NET 9 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -61,29 +64,32 @@ extern const char* topoLinkTypeStr[]; // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 +// Connection through C2C +#define PATH_C2C 3 + // Connection traversing at most a single PCIe bridge -#define PATH_PIX 3 +#define PATH_PIX 4 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) -#define PATH_PXB 4 +#define PATH_PXB 5 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
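A small editorial sanity snippet (not part of the patch) for the LINK_*/PATH_* renumbering in this topo.h hunk: the values are indices into the topoLinkTypeStr[]/topoPathTypeStr[] tables updated earlier in this patch, so inserting C2C at 3 shifts every later entry by one.

    #include <cassert>
    #include <cstring>

    int main() {
      // topoPathTypeStr[] after this patch (copied from the src/graph/topo.cc hunk above).
      const char* pathStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB",
                                "PXN", "PHB", "SYS", "NET", "DIS" };
      const int PATH_C2C = 3, PATH_PIX = 4, PATH_DIS = 10;  // values after this patch
      assert(std::strcmp(pathStr[PATH_C2C], "C2C") == 0);
      assert(std::strcmp(pathStr[PATH_PIX], "PIX") == 0);   // PIX used to be index 3
      assert(std::strcmp(pathStr[PATH_DIS], "DIS") == 0);
      return 0;
    }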
-#define PATH_PXN 5 +#define PATH_PXN 6 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 6 +#define PATH_PHB 7 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 7 +#define PATH_SYS 8 // Connection through the network -#define PATH_NET 8 +#define PATH_NET 9 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 9 +#define PATH_DIS 10 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -103,9 +109,6 @@ struct ncclTopoLinkList { int type; }; -#define NCCL_TOPO_CPU_INTEL_BDW 1 -#define NCCL_TOPO_CPU_INTEL_SKL 2 - #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff @@ -176,6 +179,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8da4aeb9e..68085b893 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -177,6 +177,7 @@ static const double perChMaxTreeBws[][3] = { NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); static int ncclPatEnable(struct ncclComm* comm) { int patEnable = ncclParamPatEnable(); + if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics if (patEnable != 2) return patEnable; if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload @@ -257,7 +258,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); - if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; + if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_PAT) busBw *= .75; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used diff --git a/src/group.cc b/src/group.cc index e387db70c..c48c0de88 100644 --- a/src/group.cc +++ b/src/group.cc @@ -193,7 +193,6 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; - struct ncclComm* cliqueComm0 = head->intraComm0; struct ncclComm* cliqueHead = head; struct ncclComm* cliqueNextHead; bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; @@ -209,7 +208,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); comm = comm->groupNext; - } while (comm != nullptr && comm->intraComm0 == cliqueComm0); + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); cliqueNextHead = comm; if (capturingYes && capturingNo) { @@ -424,38 +423,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf /* Connect channels at runtime if cumem is supported */ if (groupCommHeadMain != nullptr) { - struct ncclComm* comm = groupCommHeadMain; + struct ncclComm* cliqueHead = groupCommHeadMain; + struct ncclComm* comm = NULL; struct ncclIntruQueue asyncCollJobs; ncclIntruQueueConstruct(&asyncCollJobs); do { - bool needConnect = false; - bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; - memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); - - if (comm->cuMemSupport && needConnect) { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclCollPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->base.abortFlagDev = comm->abortFlagDev; - job->comm = comm; - NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); - memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + // We need to preconnect connections for collectives clique by clique to avoid + // race condition for split shared comms which can connect the same connections + // at the same time. 
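The clique-by-clique preconnect described in the comment above follows a simple pattern: walk the ordered comm list one clique at a time, where a clique is a maximal run of comms sharing the same intraComm0, and finish each clique's work before moving to the next. A minimal standalone sketch of that traversal, with illustrative names rather than the NCCL structs:

    #include <cstdio>

    struct Node { int clique; Node* next; };

    // Walk an ordered list one clique at a time: the inner loop stops at the first
    // node whose clique key differs from the current clique head's.
    void walkByClique(Node* head) {
      Node* cliqueHead = head;
      while (cliqueHead != nullptr) {
        Node* n = cliqueHead;
        do {
          std::printf("process node in clique %d\n", n->clique);
          n = n->next;
        } while (n != nullptr && n->clique == cliqueHead->clique);
        // ... per-clique work goes here (in the patch: launch and drain the preconnect jobs) ...
        cliqueHead = n;  // first node of the next clique, or nullptr when done
      }
    }

    int main() {
      Node c = {2, nullptr}, b = {1, &c}, a = {1, &b};
      walkByClique(&a);  // processes clique 1 (two nodes), then clique 2
      return 0;
    }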
+ comm = cliqueHead; + do { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); + + if (comm->cuMemSupport && needConnect) { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + } + comm = comm->groupNext; + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); + // connect + NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncCollJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); + if (job->destructor) job->destructor((void*)job); } - comm = comm->groupNext; - } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); - while (!ncclIntruQueueEmpty(&asyncCollJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); - if (job->destructor) job->destructor((void*)job); - } + cliqueHead = comm; + } while (cliqueHead != nullptr); // done with all buffer allocation, start registration and enqueue comm = groupCommHeadMain; diff --git a/src/include/bitops.h b/src/include/bitops.h index a650aa7f4..dcf0e2e09 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -8,6 +8,7 @@ #define NCCL_BITOPS_H_ #include +#include #if !__NVCC__ #ifndef __host__ @@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) { return u32fpDecode(x, 3); } -inline __host__ __device__ uint64_t getHash(const char* string, int n) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (int c = 0; c < n; c++) { - result = ((result << 5) + result) ^ string[c]; +// The hash isn't just a function of the bytes but also where the bytes are split +// into different calls to eatHash(). +inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) { + char const* ptr = (char const*)bytes; + acc[0] ^= size; + while (size != 0) { + // Mix the accumulator bits. + acc[0] += acc[1]; + acc[1] ^= acc[0]; + acc[0] ^= acc[0] >> 31; + acc[0] *= 0x9de62bbc8cef3ce3; + acc[1] ^= acc[1] >> 32; + acc[1] *= 0x485cd6311b599e79; + // Read in a chunk of input. + size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t); + uint64_t x = 0; + memcpy(&x, ptr, chunkSize); + ptr += chunkSize; + size -= chunkSize; + // Add to accumulator. 
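Editorial usage sketch for the eatHash()/digestHash()/getHash() helpers added in this bitops.h hunk (the remainder of the hunk follows below). It only demonstrates the property stated in the header's own comment, namely that the digest depends on how the bytes are split across eatHash() calls. It assumes the patched src/include/bitops.h is on the include path and is compiled host-side.

    #include <cstdint>
    #include <cstdio>
    #include "bitops.h"   // the header modified in this hunk

    int main() {
      uint64_t acc[2] = {1, 1};
      eatHash(acc, "ab", 2);                // feed the same bytes...
      eatHash(acc, "c", 1);                 // ...split across two calls
      uint64_t split = digestHash(acc);
      uint64_t whole = getHash("abc", 3);   // one-shot helper: accumulate then digest
      std::printf("split=%016llx whole=%016llx\n",
                  (unsigned long long)split, (unsigned long long)whole);
      // Per the comment above, the two digests generally differ because the split
      // points are mixed into the accumulator along with the data.
      return 0;
    }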
+ acc[0] += x; } - return result; +} + +template +inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { + eatHash(acc, (const void*)bytes, sizeof(T)); +} + +inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { + uint64_t h = acc[0]; + h ^= h >> 31; + h *= 0xbac3bd562846de6b; + h += acc[1]; + h ^= h >> 32; + h *= 0x995a187a14e7b445; + return h; +} + +inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { + uint64_t acc[2] = {1, 1}; + eatHash(acc, bytes, size); + return digestHash(acc); +} +template +inline __host__ __device__ uint64_t getHash(const T* bytes) { + return getHash((const void*)bytes, sizeof(T)); } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index c82ebce6f..c68b0418c 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "device.h" + #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. // CHUNKSIZE must be a multiple of SLICESIZE @@ -382,6 +383,42 @@ class RingBCAlgorithm : public RingAlgorithm { ~RingBCAlgorithm() {} }; +#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#include +#endif + +// Need a power of two to ensure it divides by parallelFactor (which is also a power of two) +#define NCCL_PAT_NWORKERS 512 + +static constexpr int PatUsed = 0x1, + PatSkipped = 0x2; + +struct ncclPatStep { + int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags; + size_t inpIx, outIx; +}; + +struct ncclPatPeer { + uint64_t step; + struct ncclConnInfo* conn; + struct ncclConnFifo* connFifo; + void* buff; + uint64_t *headPtr; + uint64_t *tailPtr; + uint64_t stepCache; + long long int accSize; + int connStepSize; +}; + +#define NCCL_SHMEM_PAT_STEPS 32 +struct ncclPatShmem { + struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS]; + int parallelFactor; + long long int localAccSize; + struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks + struct ncclPatPeer recvDims[32]; +}; + template class PatRSAlgorithm{ size_t offset; @@ -394,18 +431,17 @@ class PatRSAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int sendSkipped; // number of skipped steps during aggregation - int recvSkipped; // number of skipped steps during aggregation - int phase2recv; // receive offset for phase 2 + int stepOffset; int aggDelta; int scale; int phase; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a= 2) lastA /= 2*scale; + if (phase == 4) lastA = 1; } __device__ __host__ void reset() { nelem = getNelem(); phase = 0; scale = 1; - phase2recv = 0; as = aggDelta - 1; resetA(); } @@ -465,8 +501,9 @@ class PatRSAlgorithm{ } public: - __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): + __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) { + parallelFactor = maxParallelFactor; aggDelta = nrPow2 = (1< 1 && aggFactor < nranks/2) { d /= 2; @@ -486,160 +524,151 @@ class PatRSAlgorithm{ reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, 
int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - last = 0; - nelemOut = nelem; - outIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->outIx = offset; + ps->stepOffset = stepOffset; int skip = 0; - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; int sendDataRank = (rank + s) % nranks; - inpIx = sendDataRank * count + offset; - recvDim = -1; - sendDim = 0; - outIx = 0; - recvOffset = -1; - sendOffset = ((a - sendSkipped)%postFreq) * nelem; - sendStepOffset = 0; - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postSend = 1; + ps->inpIx = sendDataRank * count + offset; + ps->recvDim = -1; + ps->sendDim = 0; + ps->outIx = 0; + ps->recvOffset = -1; + ps->sendOffset = (a%postFreq) * nelem; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postSend = 1; } else { - postSend = 0; - } - postRecv = 0; - if (skip) sendSkipped++; - if (++a == lastA) { - phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 - resetA(); + ps->postSend = 0; } - if (skip == 0) return; + ps->postRecv = 0; } else if (phase == 1) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - sendOffset = ((a - sendSkipped)%postFreq)*nelem; - recvOffset = ((a - recvSkipped)%postFreq)*nelem; - postSend = 0; - if (recvDim == 0) { - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; - sendStepOffset = 0; + ps->recvDim = firstBitSet(s, nrPow2); + ps->sendOffset = (a%postFreq)*nelem; + ps->recvOffset = (a%postFreq)*nelem; + ps->postSend = 0; + if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postRecv = 1; } else { - sendStepOffset = (a - sendSkipped)/postFreq; + ps->postRecv = 0; } - if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postRecv = 1; - } else { - postRecv = 0; - } - s -= (1<recvDim); int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - if (sendDim == -1) { - sendOffset = -1; - sendStepOffset = 0; - } else if (as - (1<inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->sendDim == -1) { + ps->sendOffset = -1; + } else if (as - (1<recvDim) == 0) { + if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } int foffset = a - sendSkipped; - sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq; - sendOffset = (foffset%postFreq)*nelem; + ps->sendOffset = (foffset%postFreq)*nelem; } + int recvDim = ps->recvDim; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - as--; - phase = as % 2 == 1 ? 
0 : 1; - resetA(); - } - if (skip == 0) return; + if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++; } else if (phase == 2) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1; - postRecv = 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - recvDim = 0; - postSend = a == lastA-1 ? 1 : 0; + ps->recvDim = 0; + ps->postSend = a == lastA-1 ? 1 : 0; s -= 1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; + ps->recvDim = -1; + ps->recvOffset = -1; skip = 0; } else if (!skip) { - int foffset = phase2recv; - phase2recv++; - postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - recvOffset = (foffset%postFreq) * nelem; + int foffset = a + aggFactor - aggFactor/scale; + ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; } int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - int foffset = a - sendSkipped; - postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - sendStepOffset = 0; - sendOffset = (foffset%postFreq) * nelem; - if (skip || sendDim == -1) sendSkipped++; - if (++a == lastA) { - phase = 3; - resetA(); - } - if (skip == 0) return; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a; + ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->sendOffset = (foffset%postFreq) * nelem; } else if (phase == 3) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; - postRecv = a == lastA-1 ? 1 : 0; + ps->postRecv = a == lastA-1 ? 1 : 0; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - postSend = 0; - s -= (1<recvDim = firstBitSet(s, nrPow2); + ps->postSend = 0; + s -= (1<recvDim); + int foffset = a; + ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a; + if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } foffset = a - sendSkipped; - sendStepOffset = foffset / postFreq; // Accumulate on next steps - sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1; - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - scale *= 2; - phase = scale < aggFactor ? 2 : 4; - resetA(); - } - if (skip == 0) return; + if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++; + ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1; } else if (phase == 4) { - recvDim = 0; - sendDim = -1; - inpIx = rank * count + offset; - recvOffset = (phase2recv%postFreq) * nelem; - sendStepOffset = 0; - sendOffset = -1; - postRecv = 1; - postSend = 0; + ps->recvDim = 0; + ps->sendDim = -1; + ps->inpIx = rank * count + offset; + ps->recvOffset = ((aggFactor-1)%postFreq) * nelem; + ps->sendOffset = -1; + ps->postRecv = 1; + ps->postSend = 0; offset += chunkCount; - if (offset >= end) { - last = 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 1) as--; + if (p == 3) scale *= 2; + phase = + p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 : + p == 1 ? as % 2 == 1 ? 
0 : 1 : + p == 2 ? 3 : + p == 3 ? scale < aggFactor ? 2 : 4 : + 5; + if (p == 4) { + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } } else { - reset(); + resetA(); } - return; + } else if (phase == 4 && offset >= end) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; @@ -655,14 +684,12 @@ class PatAGAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int aggDelta; - int scale; - int phase; // AS computation @@ -671,7 +698,7 @@ class PatAGAlgorithm{ int bitCount[32]; int bitZeroStep[32]; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a 1 && aggFactor < nranks/2) { d /= 2; aggFactor *= 2; aggDelta /= 2; } - //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); asDim = log2Up(aggDelta); reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - last = 0; - nelemOut = nelem; - inpIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->inpIx = offset; int skip = 0; - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; int recvDataRank = (rank + s) % nranks; - outIx = recvDataRank * count + offset; - sendDim = -1; - recvDim = 0; - inpIx = 0; - sendOffset = -1; - recvOffset = (a % postFreq) * nelem; - recvStepOffset = 0; - postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postSend = 0; - a++; - if (nextSkip) { - as = nextAs(); - if (as == aggDelta/2) { - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; - } - phase = 1; - resetA(); - } - if (skip == 0) return; + ps->outIx = recvDataRank * count + offset; + ps->sendDim = -1; + ps->recvDim = 0; + ps->inpIx = 0; + ps->sendOffset = -1; + ps->recvOffset = (a % postFreq) * nelem; + ps->stepOffset = 0; + ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postSend = 0; } else if (phase == 1) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - sendOffset = recvOffset = (a % postFreq) * nelem; - postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : a/postFreq; - if (recvDim == -1) { - recvOffset = -1; - postRecv = 0; - } else if (as - (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<= nranks) ? 
1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem; + ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq; + if (ps->recvDim == -1) { + ps->recvOffset = -1; + ps->postRecv = 0; + } else if (as - (1<sendDim) == 0) { + int foffset = (a*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<recvDim) >= nranks) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq; } - if (s < nranks && sendDim == 0 && skip) { + if (s < nranks && ps->sendDim == 0 && skip) { // Don't forget to receive at least once even if we don't send afterwards - sendDim = -1; - sendOffset = -1; - postSend = 0; + ps->sendDim = -1; + ps->sendOffset = -1; + ps->postSend = 0; skip = 0; } - if (++a == lastA) { - if (as % 2 == 1) { - phase = 0; - } else { - as = nextAs(); - } - resetA(); - } - if (skip == 0) return; } else if (phase == 2) { int s = (2*a+1)*scale*aggDelta; - postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; - postRecv = 0; + ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); + ps->sendOffset = (a%postFreq) * nelem; + ps->stepOffset = a / postFreq; int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - if (recvDim == -1) { - recvOffset = -1; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->recvDim == -1) { + ps->recvOffset = -1; } else { - s -= (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - recvStepOffset = foffset / postFreq; + s -= (1<recvDim); + int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->stepOffset = foffset / postFreq; } - if (++a == lastA) { - scale /= 2; - phase = scale ? 2 : 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 2) scale /= 2; + phase = + p == 2 ? scale ? 2 : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + 1; + if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs(); + if (p == 0 && as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; + } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? 
PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; #endif diff --git a/src/include/comm.h b/src/include/comm.h index c3f4eb49f..409518713 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -131,6 +131,9 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int persistentRefs; + cudaEvent_t launchEvent, scratchEvent; /* proxy related shared res */ struct ncclProxyState* proxyState; @@ -407,6 +410,7 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; @@ -419,6 +423,7 @@ struct ncclComm { int netPluginLoaded; ncclNet_t* ncclNet; + int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; @@ -426,6 +431,7 @@ struct ncclComm { uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; @@ -565,8 +571,7 @@ struct ncclComm { struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -603,6 +608,7 @@ struct ncclComm { // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; diff --git a/src/include/device.h b/src/include/device.h index 3f918ab23..0763a579a 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -133,6 +133,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; + int hasSeen; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -374,6 +375,7 @@ struct alignas(16) ncclDevChannel { struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed + uint64_t workCounter; }; struct ncclDevComm { @@ -396,6 +398,10 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; int* rankToLocalRank; + + // Profiler counters + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -468,7 +474,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; + return cudaArch >= 800 ? (cudaArch == 1200 ? 
6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } diff --git a/src/include/graph.h b/src/include/graph.h index a22b62bb2..b779773da 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -36,7 +36,13 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +enum ncclTopoGdrMode { + ncclTopoGdrModeDisable = 0, + ncclTopoGdrModeDefault = 1, + ncclTopoGdrModePci = 2, + ncclTopoGdrModeNum = 3 +}; +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); @@ -55,9 +61,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2 +#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3 +#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4 +#define NCCL_TOPO_CPU_MODEL_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/src/include/group.h b/src/include/group.h index 91bc19068..c06d1ef1b 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -112,6 +112,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; + + // didn't find its clique, we need to insert it with ascending order based on commHash + if (*pp == nullptr) { + pp = &ncclGroupCommHead; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + } comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h deleted file mode 100644 index f165aa1bf..000000000 --- a/src/include/nccl_net.h +++ /dev/null @@ -1,604 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 -//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties -#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) -#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -// Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 - -typedef struct { - int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; -} ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int forceFlush; // Force a flush on receives - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - ncclNetVDeviceProps_v9_t vProps; - size_t maxP2pBytes; // Max transfer size for point-to-point operations - size_t maxCollBytes; // Max transfer size for collective operations -} ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclNet_v9_t; - -typedef ncclNet_v9_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 - -typedef struct { - void* mhandle; - void* address; - size_t size; -} ncclNetSGE_v9_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclCollNet_v9_t; - -typedef ncclCollNet_v9_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v8_t; - -typedef struct { - void* mhandle; - void* address; - uint32_t size; -} ncclNetSGE_v8_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. 
handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v8_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. 
rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
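/*
 * Editor's illustration (not part of the patch): a minimal sketch of the
 * non-blocking connect()/accept() contract documented in the comments above,
 * driving both ends of a loopback connection the way a plugin self-test might.
 * The ncclNet_v6_t instance `net`, the exchanged `handle` and the `listenComm`
 * are assumed to already exist; error handling is reduced to early returns.
 */
static ncclResult_t exampleEstablishPair(ncclNet_v6_t* net, int dev, void* handle,
                                         void* listenComm, void** sendComm, void** recvComm) {
  *sendComm = NULL;
  *recvComm = NULL;
  ncclResult_t res;
  // Both calls return ncclSuccess with a NULL comm until the connection is
  // actually established, so the caller keeps polling instead of blocking
  // inside the plugin.
  while (*sendComm == NULL || *recvComm == NULL) {
    if (*sendComm == NULL) { res = net->connect(dev, handle, sendComm); if (res != ncclSuccess) return res; }
    if (*recvComm == NULL) { res = net->accept(listenComm, recvComm);   if (res != ncclSuccess) return res; }
  }
  return ncclSuccess;
}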
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
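/*
 * Editor's illustration (not part of the patch): the asynchronous request
 * model shared by all ncclNet_t versions, as documented above. isend() may
 * hand back request == NULL when it cannot make progress yet, in which case
 * the caller retries; once a request exists, test() is polled until done != 0.
 * `net`, `sendComm`, `mhandle` and the buffer are assumed to exist.
 */
static ncclResult_t exampleSendBlocking(ncclNet_v5_t* net, void* sendComm,
                                        void* data, int size, int tag, void* mhandle) {
  void* request = NULL;
  ncclResult_t res;
  while (request == NULL) {                 // retry until the send is posted
    res = net->isend(sendComm, data, size, tag, mhandle, &request);
    if (res != ncclSuccess) return res;
  }
  int done = 0, sentSize = 0;
  while (!done) {                           // poll the request for completion
    res = net->test(request, &done, &sentSize);
    if (res != ncclSuccess) return res;
  }
  return ncclSuccess;
}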
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h deleted file mode 100644 index a8164d075..000000000 --- a/src/include/nccl_profiler.h +++ /dev/null @@ -1,235 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include - -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - const char* func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - const char* datatype; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - const char* algo; - const char* proto; - } coll; - - struct { - const char* name; - uint64_t commHash; - const char* func; - void* buff; - const char* datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v2_t; - -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - 
// - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; - -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - uint8_t func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - uint8_t datatype; - uint32_t op; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; - } coll; - - struct { - const char* name; - uint64_t commHash; - uint8_t func; - void* buff; - uint8_t datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v1_t; - -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - // - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v1_t; - -#endif diff --git a/src/include/nccl_tuner.h 
b/src/include/nccl_tuner.h deleted file mode 100644 index 6e61118b9..000000000 --- a/src/include/nccl_tuner.h +++ /dev/null @@ -1,149 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - regBuff: can register user buffer - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int regBuff, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; - -typedef ncclTuner_v4_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
- // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this time - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the give collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int* algorithm, int* protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. 
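/*
 * Editor's illustration (not part of the patch): a trivial getCollInfo() for
 * the cost-table tuner API (v4 signature documented earlier in this header).
 * Following the pattern of the ext-tuner example plugin, the float** table is
 * viewed as a 2D [algo][proto] array; entries equal to NCCL_ALGO_PROTO_IGNORE
 * (-1.0) were marked by NCCL as unavailable and must be left alone. The
 * constants NCCL_NUM_PROTOCOLS, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE and
 * NCCL_ALGO_PROTO_IGNORE are assumed to come from nccl_common.h.
 */
static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto, int regBuff, int* nChannels) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  // Favor Ring/Simple whenever NCCL did not mark that combination as ignored.
  if (NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto &&
      table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0f;
  }
  *nChannels = 1;                       // a single channel is always a safe choice
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)regBuff;
  return ncclSuccess;
}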
- // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; - -#endif diff --git a/src/include/net.h b/src/include/net.h index d1926ccd8..afc2d160e 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index 5fae9b542..c3a79e35c 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 5d00f0792..2c18b36b9 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,9 +31,10 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h new file mode 100644 index 000000000..d57aad5a9 --- /dev/null +++ b/src/include/plugin/nccl_net.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 + +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
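/*
 * Editor's illustration (not part of the patch): how the size caps defined
 * above might be applied. NCCL_MAX_NET_SIZE_BYTES is the largest value NCCL
 * accepts for the maxP2pBytes/maxCollBytes properties, and MAX_NET_SIZE is a
 * conservative per-message cap for plugins that do not report a limit. The
 * helper name and the clamping policy are illustrative assumptions only.
 */
static size_t exampleClampMessageSize(size_t requested, size_t pluginMaxP2pBytes) {
  size_t cap = pluginMaxP2pBytes ? pluginMaxP2pBytes : (size_t)MAX_NET_SIZE;
  if (cap > (size_t)NCCL_MAX_NET_SIZE_BYTES) cap = (size_t)NCCL_MAX_NET_SIZE_BYTES;
  return requested < cap ? requested : cap;   // send at most one capped message at a time
}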
+#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 32 + +// Max number of ncclNet objects which can live in the same process +#define NCCL_NET_MAX_PLUGINS 3 + +// NCCL core profiler callback for network defined events instrumentation +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); + +#include "net/net_v10.h" +#include "net/net_v9.h" +#include "net/net_v8.h" +#include "net/net_v7.h" +#include "net/net_v6.h" + +typedef ncclNet_v10_t ncclNet_t; +typedef ncclCollNet_v10_t ncclCollNet_t; +typedef ncclNetSGE_v10_t ncclNetSGE_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 + +#endif // end include guard diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h new file mode 100644 index 000000000..34cf9a927 --- /dev/null +++ b/src/include/plugin/nccl_profiler.h @@ -0,0 +1,69 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include +#include "profiler/profiler_v3.h" +#include "profiler/profiler_v2.h" +#include "profiler/profiler_v1.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define 
NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h new file mode 100644 index 000000000..f2401890d --- /dev/null +++ b/src/include/plugin/nccl_tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +#include "tuner/tuner_v4.h" +#include "tuner/tuner_v3.h" +#include "tuner/tuner_v2.h" + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h new file mode 100644 index 000000000..ada6d482e --- /dev/null +++ b/src/include/plugin/net/net_v10.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v10_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
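/*
 * Editor's illustration (not part of the patch): using the new v10
 * makeVDevice() entry point, defined just above, to fuse two physical NICs
 * into one virtual NIC (the "NIC fusion" feature from the release notes).
 * `net` is an ncclNet_v10_t instance and the two device indices are assumed
 * to refer to ports the caller already knows belong together, e.g. under the
 * same PCI switch.
 */
static ncclResult_t exampleFuseNics(ncclNet_v10_t* net, int dev0, int dev1, int* vDev) {
  ncclNetVDeviceProps_v10_t props;
  props.ndevs = 2;                 // fuse exactly two ports (max is NCCL_NET_MAX_DEVS_PER_NIC_V10)
  props.devs[0] = dev0;
  props.devs[1] = dev1;
  // On success, *vDev holds the index of the newly created virtual NIC, which
  // can then be used like any other device index (getProperties, listen, ...).
  return net->makeVDevice(vDev, &props);
}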
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclCollNet_v10_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v6.h b/src/include/plugin/net/net_v6.h new file mode 100644 index 000000000..99445ce17 --- /dev/null +++ b/src/include/plugin/net/net_v6.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V6_H_ +#define NET_V6_H_ + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +#endif diff --git a/src/include/plugin/net/net_v7.h b/src/include/plugin/net/net_v7.h new file mode 100644 index 000000000..e9b19dec8 --- /dev/null +++ b/src/include/plugin/net/net_v7.h @@ -0,0 +1,120 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V7_H_ +#define NET_V7_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif diff --git a/src/include/plugin/net/net_v8.h b/src/include/plugin/net/net_v8.h new file mode 100644 index 000000000..a178132fe --- /dev/null +++ b/src/include/plugin/net/net_v8.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V8_H_ +#define NET_V8_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v8_t; + +typedef struct { + void* mhandle; + void* address; + uint32_t size; +} ncclNetSGE_v8_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. 
+ // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + +#endif diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h new file mode 100644 index 000000000..ce9d91748 --- /dev/null +++ b/src/include/plugin/net/net_v9.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V9_H_ +#define NET_V9_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. 
Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. 
If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclNet_v9_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v9_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
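A minimal caller-side sketch of this retry-and-poll convention, assuming an already connected collComm, buffers registered through regMr, and float32 summation (the wrapper name is illustrative):

static ncclResult_t exampleAllReduceBlocking(ncclCollNet_v9_t* collNet, void* collComm,
                                             void* sendBuf, void* recvBuf, size_t count,
                                             void* sendMhandle, void* recvMhandle) {
  void* request = NULL;
  // iallreduce may return request == NULL when it cannot be posted yet; retry.
  while (request == NULL) {
    ncclResult_t res = collNet->iallreduce(collComm, sendBuf, recvBuf, count, ncclFloat32, ncclSum,
                                           sendMhandle, recvMhandle, &request);
    if (res != ncclSuccess) return res;
  }
  int done = 0, size = 0;
  while (!done) {
    // test() polls for completion; size reports the bytes transferred when non-NULL.
    ncclResult_t res = collNet->test(request, &done, &size);
    if (res != ncclSuccess) return res;
  }
  return ncclSuccess;
}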
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclCollNet_v9_t; + +#endif // end include guard diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h new file mode 100644 index 000000000..7336c34d9 --- /dev/null +++ b/src/include/plugin/plugin.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PLUGIN_H_ +#define NCCL_PLUGIN_H_ + +#include "nccl.h" + +void* ncclOpenNetPluginLib(const char* name); +void* ncclOpenTunerPluginLib(const char* name); +void* ncclOpenProfilerPluginLib(const char* name); +void* ncclGetNetPluginLib(void); +ncclResult_t ncclClosePluginLib(void* handle); + +#endif diff --git a/src/include/plugin/profiler/net_ib.h b/src/include/plugin/profiler/net_ib.h new file mode 100644 index 000000000..2ac6d5c97 --- /dev/null +++ b/src/include/plugin/profiler/net_ib.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_H_ +#define NET_IB_H_ + +#include "nccl_profiler.h" +#include "net_ib_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_ib_v1.h b/src/include/plugin/profiler/net_ib_v1.h new file mode 100644 index 000000000..f142de5f5 --- /dev/null +++ b/src/include/plugin/profiler/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. 
NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/net_socket.h b/src/include/plugin/profiler/net_socket.h new file mode 100644 index 000000000..9f5749633 --- /dev/null +++ b/src/include/plugin/profiler/net_socket.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_H_ +#define NET_SOCKET_H_ + +#include "nccl_profiler.h" +#include "net_socket_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_socket_v1.h b/src/include/plugin/profiler/net_socket_v1.h new file mode 100644 index 000000000..0cb664f20 --- /dev/null +++ b/src/include/plugin/profiler/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v1.h b/src/include/plugin/profiler/profiler_v1.h new file mode 100644 index 000000000..3b6710240 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v1.h @@ -0,0 +1,107 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
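As a sketch of how an IB transport might fill the QP descriptor defined above before handing it to the profiler (the plugin id would encode NCCL_PROFILER_NET_IB_VER; the callback wiring and the libibverbs objects wr and qp used here are assumptions):

#include <infiniband/verbs.h>

static void exampleFillQpDescr(ncclProfilerNetIbDescr_v1_t* eDescr, int dev,
                               struct ibv_send_wr* wr, struct ibv_qp* qp, size_t bytes) {
  eDescr->type = ncclProfileQp;        // plugin-defined event type from the enum above
  eDescr->qp.device = dev;             // network device id
  eDescr->qp.wr_id = wr->wr_id;        // work request id
  eDescr->qp.opcode = (int)wr->opcode; // ibv opcode
  eDescr->qp.qpNum = (int)qp->qp_num;  // QP number
  eDescr->qp.length = bytes;           // work request data length
}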
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v2.h b/src/include/plugin/profiler/profiler_v2.h new file mode 100644 index 000000000..146152a7a --- /dev/null +++ b/src/include/plugin/profiler/profiler_v2.h @@ -0,0 +1,104 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v3.h b/src/include/plugin/profiler/profiler_v3.h new file mode 100644 index 000000000..10c50594f --- /dev/null +++ b/src/include/plugin/profiler/profiler_v3.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
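A hedged sketch of the plugin side of the v2 interface above: an init/startEvent/stopEvent trio that only counts events (the context layout and function names are illustrative assumptions):

#include <stdlib.h>

static ncclResult_t examplePluginInit(void** context, int* eActivationMask) {
  int* counters = (int*)calloc(2, sizeof(int)); // [0]=started, [1]=stopped
  if (counters == NULL) return ncclSystemError;
  *context = counters;
  *eActivationMask = ~0; // subscribe to every event type
  return ncclSuccess;
}

static ncclResult_t examplePluginStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
  (void)eDescr;
  ((int*)context)[0]++;
  *eHandle = context; // dummy handle: point back at the context
  return ncclSuccess;
}

static ncclResult_t examplePluginStopEvent(void* eHandle) {
  ((int*)eHandle)[1]++;
  return ncclSuccess;
}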
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +#endif diff --git a/src/include/plugin/tuner/tuner_v2.h b/src/include/plugin/tuner/tuner_v2.h new file mode 100644 index 000000000..ec96f6057 --- /dev/null +++ b/src/include/plugin/tuner/tuner_v2.h @@ -0,0 +1,53 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef TUNER_V2_H_ +#define TUNER_V2_H_ + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. 
Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - collNetTypeSupport: whether collnet supports this type
+ // - nvlsTypeSupport: whether nvlink sharp supports this type
+ // - numPipeOps: number of operations in the group
+ //
+ // Outputs:
+ // - algorithm: selected algorithm to be used for the given collective
+ // - protocol: selected protocol to be used for the given collective
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int collNetSupport, int nvlsSupport, int numPipeOps,
+ int* algorithm, int* protocol, int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object
+ ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v3.h b/src/include/plugin/tuner/tuner_v3.h
new file mode 100644
index 000000000..4fa10e825
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v3.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+ // Name of the tuner
+ const char* name;
+
+ // Initializes tuner states.
+ // Inputs:
+ // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - numPipeOps: number of operations in the group
+ // - numAlgo: number of algorithms in collCostTable
+ // - numProto: number of protocols in collCostTable
+ //
+ // Outputs:
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // InOut:
+ // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+ // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object
+ ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v4.h b/src/include/plugin/tuner/tuner_v4.h
new file mode 100644
index 000000000..a4b38a0a3
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v4.h
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
typedef struct {
+ // Name of the tuner
+ const char* name;
+
+ // Initializes tuner states.
+ // Inputs:
+ // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - numPipeOps: number of operations in the group
+ // - numAlgo: number of algorithms in collCostTable
+ // - numProto: number of protocols in collCostTable
+ // - regBuff: can register user buffer
+ //
+ // Outputs:
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // InOut:
+ // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+ // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int regBuff, int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2b7efe0f6..8d4107963 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -17,6 +17,18 @@ struct ncclTaskP2p; struct ncclInfo; struct ncclComm; struct ncclProxyOp; +struct ncclProxyConnector; + +struct ncclProfilerProxy { + bool initialized; + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; + uint64_t workCounter[MAXCHANNELS]; // host work counter + struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; + struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; +}; + +extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); @@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); +// Kernel Channel Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); + // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); @@ -51,5 +67,9 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); + +// Profiler callback for network plugin +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c97a4d7ce..225acb22d 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,7 +32,8 @@ typedef enum : uint8_t { ncclPatternPatUp, ncclPatternPatDown, ncclPatternSend, - ncclPatternRecv + ncclPatternRecv, + ncclPatternProfiler, } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; @@ -93,6 +94,7 @@ struct ncclProxyOp { int peer; pid_t pid; void* profilerContext; + uint64_t workCounter; struct ncclProxyOp *enqNext; }; @@ -129,12 +131,15 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; + uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; + void* kernelEventHandle; void* stepEventHandles[NCCL_STEPS]; size_t transSize; + uint64_t workCounter; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; diff --git a/src/include/ras.h b/src/include/ras.h index 7909b3dc8..d27a543e2 100644 --- a/src/include/ras.h +++ b/src/include/ras.h @@ -15,6 +15,8 @@ struct rasRankInit { pid_t pid; int cudaDev; int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; }; ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); diff --git a/src/include/register.h b/src/include/register.h index 740a645f4..143f41bc9 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -42,7 +42,7 @@ struct ncclReg { uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; - size_t regSize; + size_t regUCSize, regMCSize; int 
dev; CUmemGenericAllocationHandle mcHandle; uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ diff --git a/src/include/shm.h b/src/include/shm.h index b519e5dc9..223d87346 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -14,7 +14,6 @@ struct shmCuIpc { CUmemFabricHandle handle; CUmemGenericAllocationHandle data; }; - int tpProxyRank; void *ptr; size_t size; }; @@ -30,8 +29,8 @@ struct shmIpcDesc { typedef struct shmIpcDesc ncclShmIpcDesc_t; -ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); -ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); +ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); #endif diff --git a/src/include/socket.h b/src/include/socket.h index f0a3237ce..ffa148091 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how); -ncclResult_t ncclSocketClose(struct ncclSocket* sock); +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false); #endif diff --git a/src/include/strongstream.h b/src/include/strongstream.h index 0984dfe57..c56d5aca5 100644 --- a/src/include/strongstream.h +++ b/src/include/strongstream.h @@ -10,13 +10,24 @@ #include "nccl.h" #include "checks.h" +#include +#include #include +// ncclCudaContext: wraps a CUDA context with per-context state. +struct ncclCudaContext; + +// Get a ncclCudaContext to track the currently active CUDA context. +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out); +// Drop reference. +void ncclCudaContextDrop(struct ncclCudaContext* cxt); + /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes * easily. */ struct ncclCudaGraph { #if CUDART_VERSION >= 11030 + cudaStream_t origin; cudaGraph_t graph; unsigned long long graphId; #endif @@ -25,6 +36,7 @@ struct ncclCudaGraph { inline struct ncclCudaGraph ncclCudaGraphNone() { struct ncclCudaGraph tmp; #if CUDART_VERSION >= 11030 + tmp.origin = nullptr; tmp.graph = nullptr; tmp.graphId = ULLONG_MAX; #endif @@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() { inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { #if CUDART_VERSION >= 11030 - return graph.graph != nullptr; + return graph.graphId != ULLONG_MAX; #else return false; #endif @@ -57,60 +69,37 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t * streams unfit for the use of serializing access to a persistent resource. * Strong streams have been introduced to address this need. * - * - All updates to a strong stream must be enclosed by a Acquire/Release pair. + * All updates to a strong stream must be enclosed by a Acquire/Release pair. 
* - * - The Acquire, Release, and all updates take a ncclCudaGraph parameter - * indicating the currently capturing graph (or none). This parameter must be - * the same for the entire sequence of {Acquire; ...; Release}. + * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add + * work. * - * - An {Acquire; ...; Release} sequence must not be concurrent with any - * other operations against the strong stream including graph launches which - * reference this stream. + * Release publishes the work streams work into the strong stream. The Release + * must be issued by the same thread that did the Acquire. */ struct ncclStrongStream; ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); -// Acquire-fence the strong stream. +// Acquire the strong stream. Upon return `*workStream` will be usable to add work. +// `concurrent` indicates if other threads may be using the strong stream. ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream ); -// Acquire-fence the strong stream assuming no graph is capturing. This permits -// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA -// calls. Strong stream still must be released via: -// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); - -// Release-fence of the strong stream. -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); - -// Add a host launch to the stream. -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - cudaHostFn_t fn, void* arg -); -// Add a kernel launch to the stream. -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +// Get the workStream for an already acquired strong stream. +// `concurrent` indicates if other threads may be using the strong stream. +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream ); -// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. -// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus -// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the -// implementation to induce few graph dependencies. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false -); -// `b` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false -); -// `a` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false +// Release of the strong stream. +// `concurrent` indicates if other threads may be using the strong stream. 
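A short sketch of the Acquire/Release pattern described above, outside of graph capture; it mirrors the devCommSetup changes later in this patch and assumes an already constructed strong stream and the NCCLCHECK macro:

static ncclResult_t exampleUseStrongStream(struct ncclStrongStream* ss) {
  cudaStream_t workStream;
  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), ss, /*concurrent=*/false, &workStream));
  // ... enqueue kernels/copies on workStream with regular CUDA calls ...
  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), ss, /*concurrent=*/false));
  return ncclSuccess;
}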
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent); + +ncclResult_t ncclStreamWaitStream( + cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent ); // Synchrnoization does not need the strong stream to be acquired. @@ -118,23 +107,28 @@ ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); //////////////////////////////////////////////////////////////////////////////// -struct ncclStrongStreamGraph; // internal to ncclStrongStream +struct ncclStrongStreamCapture; // internal to ncclStrongStream struct ncclStrongStream { - // Used when not graph capturing. - cudaStream_t cudaStream; + // The stream to use for non-captured work. + cudaStream_t liveStream; + void* liveAcquiredBy; #if CUDART_VERSION >= 11030 + // This stream ever appeared in a graph capture. + bool everCaptured; + pthread_mutex_t lock; + struct ncclStrongStreamCapture* captureHead; // The event used to establish order between graphs and streams. During acquire // this event is waited on, during release it is recorded to. cudaEvent_t serialEvent; - // This stream ever appeared in a graph capture. - bool everCaptured; - // Tracks whether serialEvent needs to be recorded to upon Release(). - bool serialEventNeedsRecord; - struct ncclStrongStreamGraph* graphHead; -#else - cudaEvent_t scratchEvent; #endif }; +struct ncclCudaContext { + struct ncclCudaContext* next; + CUcontext hcontext; + int refCount; + struct ncclStrongStream launchOrder; +}; + #endif diff --git a/src/include/transport.h b/src/include/transport.h index 37187f69e..c563fbbd6 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -18,6 +18,7 @@ #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 +#define TRANSPORT_PROFILER 4 #include "proxy.h" #include "comm.h" @@ -26,6 +27,7 @@ extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; +extern struct ncclTransport profilerTransport; extern struct ncclTransport* ncclTransports[]; // Forward declarations @@ -65,8 +67,10 @@ struct ncclNvlsSharedRes { CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; - size_t buffSize; - size_t creditSize; + size_t creditUCSize; + size_t creditMCSize; + size_t buffUCSize; + size_t buffMCSize; CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address @@ -123,7 +127,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize); ncclResult_t ncclNvlsFree(struct ncclComm* 
comm); enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/init.cc b/src/init.cc index 3e218ab07..46b02e65e 100644 --- a/src/init.cc +++ b/src/init.cc @@ -51,17 +51,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); -static uint64_t hashUniqueId(ncclUniqueId const &id) { - char const *bytes = (char const*)&id; - uint64_t h = 0xdeadbeef; - for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -111,7 +100,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { memset(out, 0, sizeof(*out)); // copy to avoid alignment mismatch memcpy(out, &handle, sizeof(handle)); - TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES)); return ncclSuccess; } @@ -232,6 +221,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } @@ -268,6 +259,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); + + ncclCudaContextDrop(comm->context); + free(comm); return ncclSuccess; @@ -309,17 +303,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); - if (ret != ncclSuccess) { - /* if ret is not ncclInProgress, we just keep it. */ + if (ret == ncclInProgress) { WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); - if (ret == ncclInProgress) ret = ncclInvalidArgument; + ret = ncclInvalidArgument; goto exit; } - /* if there is linked group job, we should complete it. */ - if (comm->groupJob) { - NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); - comm->groupJob = NULL; - } + /* if ret is not ncclInProgress, we just keep it. */ } exit: @@ -357,6 +346,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // the device we're on (failure cause #1) , better know it early. 
CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; @@ -396,6 +387,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { @@ -437,13 +430,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; + cudaStream_t deviceStream; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; @@ -494,10 +488,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + // Alloc profiler counters for the kernel + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail); + tmpCommAndChans.comm.workStarted = comm->profiler.workStarted; + tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted; + ncclCommPushCudaHostFree(comm, comm->profiler.workStarted); + ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted); + if (comm->collNetDenseToUserRank != nullptr) { - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail); } for (int 
c=0; c < MAXCHANNELS; c++) { @@ -510,14 +512,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail); } } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail); exit: + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -1000,6 +1002,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } + comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; @@ -1376,12 +1379,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), - // add unique split counter and the color - ncclUniqueId tmpId; - memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); - comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + // child hash obtained from (parent hash, split count, color) + uint64_t hacc[2] = {1, 1}; + eatHash(hacc, &job->parent->commHash); + eatHash(hacc, &job->splitCount); + eatHash(hacc, &job->color); + comm->commHash = digestHash(hacc); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1394,8 +1397,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // obtain a unique hash using the first commId - comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); - commIdHash = hashUniqueId(job->commId[0]); + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, 
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1610,6 +1612,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1618,6 +1621,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; + comm->config.trafficClass = internalConfigPtr->trafficClass; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); @@ -1642,6 +1646,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId const char* commIdEnv = NULL; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob* job = NULL; + bool launchedJob = false; // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); @@ -1695,12 +1700,13 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // start the bootstrap root before bootstrapping, use only the first handle NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); } + launchedJob = true; NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: - if (job) ncclCommInitJobFree(job); + if (job && !launchedJob) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1896,7 +1902,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. - while (comm->persistentRefs != 0) { + while (comm->localPersistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { @@ -1964,7 +1970,6 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { } return ret; fail: - free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2215,6 +2220,11 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); + /* if there is linked group job, we should complete it. 
*/ + if (*asyncError == ncclSuccess && comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } return ncclSuccess; } @@ -2265,16 +2275,13 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { #if CUDART_VERSION >= 12010 size_t memGran = 0; - size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; - CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag; int dcnt; - int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; @@ -2284,6 +2291,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGet(&currentDev, cudaDev)); if (ncclCuMemEnable()) { + size_t handleSize = size; int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; @@ -2299,40 +2307,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CUDACHECK(cudaGetDeviceCount(&dcnt)); - - if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { - /* mc property */ - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); - } else { - ALIGN_SIZE(size, memGran); - } + ALIGN_SIZE(handleSize, memGran); if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } } else { /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; @@ -2340,7 +2333,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); } if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } diff --git
a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 23746b3c5..3e9dfcdb8 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, } control_un; struct cmsghdr *cmptr; - char dummy_buffer[1]; + char dummy_buffer[1] = {'\0'}; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); if (sendFd != -1) { + memset(&control_un, '\0', sizeof(control_un)); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); diff --git a/src/misc/param.cc b/src/misc/param.cc index eb50cfeed..d7c324fe9 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) { size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { + if (line[0] == '#') continue; if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index dfb4e6888..731dbcee1 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -171,6 +171,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memset(addrs+found, '\0', sizeof(*addrs)); memcpy(addrs+found, interface->ifa_addr, salen); found++; } @@ -905,9 +906,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { if (sock != NULL) { if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { + if (wait) { + char data; + int closed = 0; + do { + int offset = 0; + if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break; + } while (closed == 0); + } /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 61b0e4b5b..e6cce9807 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,28 +9,61 @@ #include "checks.h" #include "param.h" -// Tracks the chain of graph nodes for a given graph captured identified by -// its graph id. This state has to live for as long as captured work is being -// submitted. CUDA doesn't have mechanism to inform us when the user ends capture -// so the best we can do is get notified when the graph is destroyed. -struct ncclStrongStreamGraph { - struct ncclStrongStreamGraph* next; - // Atomically exchanged to false by both the main thread or the graph destructor - // callback. The last to arrive deletes the node. - bool alive; +// Tracks the captured work a given graph captured identified by its graph id. 
+struct ncclStrongStreamCapture { + struct ncclStrongStreamCapture* next; + cudaGraph_t graph; unsigned long long graphId; - // For each graph we track the "tip" of the chain of graph nodes. A linear - // chain would always have just one node at its tip, but since we have to merge - // in chains from other streams (via ncclStrongStreamWaitStream) some spots - // in the chain can be wider than a single node and thus need a list, so we - // maintain a dynamically sized array of tip nodes. - int tipCount, tipCapacity; - cudaGraphNode_t* tipNodes; + cudaStream_t captureStream; + cudaGraphNode_t lastRecord; + void* acquiredBy; }; -static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { - free(g->tipNodes); - free(g); +//////////////////////////////////////////////////////////////////////////////// + +static ncclCudaContext* cxtListHead = nullptr; +static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { + ncclResult_t result = ncclSuccess; + CUcontext hcontext; + CUCHECK(cuCtxGetCurrent(&hcontext)); + + pthread_mutex_lock(&cxtListLock); + struct ncclCudaContext* p = cxtListHead; + while (1) { + if (p == nullptr) { + p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + p->refCount = 1; + p->hcontext = hcontext; + p->next = cxtListHead; + cxtListHead = p; + NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave); + break; + } + if (p->hcontext == hcontext) { + p->refCount += 1; + break; + } + p = p->next; + } +leave: + pthread_mutex_unlock(&cxtListLock); + *out = p; + return ncclSuccess; +} + +void ncclCudaContextDrop(struct ncclCudaContext* cxt) { + pthread_mutex_lock(&cxtListLock); + if (0 == --cxt->refCount) { + struct ncclCudaContext** pp = &cxtListHead; + while (*pp != cxt) pp = &(*pp)->next; + *pp = cxt->next; // remove from list + // Destroy resources held in cxt + ncclStrongStreamDestruct(&cxt->launchOrder); + free(cxt); + } + pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// @@ -43,9 +76,9 @@ ncclResult_t ncclCudaGetCapturingGraph( NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, nullptr)); #if CUDART_VERSION >= 11030 + graph->origin = nullptr; graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif @@ -56,13 +89,14 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { + graph->origin = nullptr; graph->graph = nullptr; - gid = ULLONG_MAX; + graph->graphId = ULLONG_MAX; + } else { + graph->origin = stream; } - graph->graphId = gid; #endif } #endif @@ -86,315 +120,218 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, 
cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; - ss->serialEventNeedsRecord = false; - ss->graphHead = nullptr; - #else - CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); + ss->captureHead = nullptr; + pthread_mutex_init(&ss->lock, nullptr); + CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } -static void graphDestructor(void* arg) { - struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } -} - ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamDestroy(ss->cudaStream)); + CUDACHECK(cudaStreamDestroy(ss->liveStream)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventDestroy(ss->serialEvent)); - // Delete list of per-graph chains. - struct ncclStrongStreamGraph* g = ss->graphHead; - while (g != nullptr) { - struct ncclStrongStreamGraph* next = g->next; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } - g = next; + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap) { + struct ncclStrongStreamCapture* next = cap->next; + CUDACHECK(cudaStreamDestroy(cap->captureStream)); + free(cap); + cap = next; } - #else - CUDACHECK(cudaEventDestroy(ss->scratchEvent)); + CUDACHECK(cudaEventDestroy(ss->serialEvent)); + pthread_mutex_destroy(&ss->lock); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) +NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); +constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; -static void ensureTips(struct ncclStrongStreamGraph* g, int n) { - if (g->tipCapacity < n) { - g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); - g->tipCapacity = n; - } -} +static __thread char threadIdMarker; +static void* localThreadId() { return &threadIdMarker; } ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (graph.graph == nullptr) { - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + ss->liveAcquiredBy = localThreadId(); + if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); } } else { - ss->everCaptured = true; - // Find the current graph in our list of graphs if it exists. - struct ncclStrongStreamGraph** pg = &ss->graphHead; - struct ncclStrongStreamGraph* g; - while (*pg != nullptr) { - g = *pg; - if (g->graphId == graph.graphId) { - // Move to front of list so that operations after acquire don't have to search the list. 
- *pg = g->next; - g->next = ss->graphHead; - ss->graphHead = g; + bool firstCapture = !ss->everCaptured; + __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED); + + ncclResult_t ret = ncclSuccess; + if (concurrent) pthread_mutex_lock(&ss->lock); + + // Look for capture in our list of active captures. + struct ncclStrongStreamCapture** pcap = &ss->captureHead; + struct ncclStrongStreamCapture* cap; + struct ncclStrongStreamCapture* spare = nullptr; + while (*pcap != nullptr) { + cap = *pcap; + if (cap->graphId == graph.graphId) { // Capture node already exists. + *workStream = cap->captureStream; + cap->acquiredBy = localThreadId(); + if (concurrent) pthread_mutex_unlock(&ss->lock); return ncclSuccess; - } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { - // Unrelated graph that has been destroyed. Remove and delete. - *pg = g->next; - ncclStrongStreamGraphDelete(g); } else { - pg = &g->next; + cudaStreamCaptureStatus status; + CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); + if (status == cudaStreamCaptureStatusActive) { + pcap = &cap->next; // Active capture doesn't match, on to next. + } else { // Capture no longer active + *pcap = cap->next; // Remove from current list + if (spare == nullptr) { // Keep one spare to reuse below. + spare = cap; + } else { + cudaStreamDestroy(cap->captureStream); + free(cap); + } + } } } - - // This is a new graph so add to the list. - g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); - g->graphId = graph.graphId; - g->tipNodes = nullptr; - g->tipCapacity = 0; - g->tipCount = 0; - g->next = ss->graphHead; - ss->graphHead = g; - g->alive = true; - NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); - - if (mixing && ss->serialEventNeedsRecord) { - // Can only be here if previous release was for uncaptured work that - // elided updating the event because no capture had yet occurred. - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); + // No matching capture, need a new entry. + cap = spare; + if (cap == nullptr) { + cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } - ss->serialEventNeedsRecord = false; + cap->graphId = graph.graphId; + cap->lastRecord = nullptr; + cap->acquiredBy = localThreadId(); + // Push to capturing list. + cap->next = ss->captureHead; + ss->captureHead = cap; - // First node in the chain must be a wait on the serialEvent. - if (mixing) { - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); - g->tipCount = 1; - } else { - g->tipCount = 0; - } - } - #endif - return ncclSuccess; -} + do_unlock: + if (concurrent) pthread_mutex_unlock(&ss->lock); + if (ret != ncclSuccess) return ret; -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { - #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - } - ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. 
- #endif - return ncclSuccess; -} + *workStream = cap->captureStream; -static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { - if (g == nullptr || g->graphId != id) { - WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); - return ncclInternalError; - } - return ncclSuccess; -} + // Bring captureStream into the graph but without any dependencies. + cudaEvent_t scratch; + CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming)); + CUDACHECK(cudaEventRecord(scratch, graph.origin)); + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); + CUDACHECK(cudaEventDestroy(scratch)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { - #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->serialEventNeedsRecord) { - if (graph.graph == nullptr) { - if (ss->everCaptured) { - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); - ss->serialEventNeedsRecord = false; - } - } else { - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); - g->tipCount = 1; - ss->serialEventNeedsRecord = false; + if (mixing && firstCapture) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (mixing) { + // First dependency is to wait on serialEvent + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, cudaEventWaitExternal)); } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; } else { - cudaHostNodeParams p; - p.fn = fn; - p.userData = arg; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + *workStream = cap->captureStream; + if (concurrent) pthread_mutex_unlock(&ss->lock); } - ss->serialEventNeedsRecord = true; #else - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); + *workStream = ss->liveStream #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes +ncclResult_t ncclStrongStreamRelease( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent ) { #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - } else { - cudaKernelNodeParams p; - p.func = fn; - p.gridDim = grid; - p.blockDim = block; - p.kernelParams = args; - p.sharedMemBytes = sharedMemBytes; - p.extra = 
nullptr; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - #endif - return ncclSuccess; -} + bool mixing = ncclParamGraphMixingSupport(); + if (mixing) { + if (graph.graphId == ULLONG_MAX) { + if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } + } else { + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + if (concurrent) pthread_mutex_unlock(&ss->lock); -// Merge node list `b` into list `a` but don't add duplicates. -static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { - int an = a->tipCount; - ensureTips(a, an + bn); - for (int bi=0; bi < bn; bi++) { - for (int ai=0; ai < an; ai++) { - if (a->tipNodes[ai] == bNodes[bi]) goto next_b; - } - a->tipNodes[a->tipCount++] = bNodes[bi]; - next_b:; - } -} + // Add event record node with dependencies added further down. + cudaGraphNode_t recordNode; + CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bg->tipNodes, bg->tipCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); - #endif - return ncclSuccess; -} + // Make this record order after previous record on this stream. + if (cap->lastRecord != nullptr) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + } + cap->lastRecord = recordNode; -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - // It is ok to use a->serialEvent to record b since we'll be setting - // a->serialEventNeedsRecord so the event won't be considered accurate - // until re-recorded. 
- CUDACHECK(cudaEventRecord(a->serialEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); - } else { - cudaStreamCaptureStatus status; - unsigned long long bGraphId; - cudaGraphNode_t const* bNodes; - size_t bCount = 0; - CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); - if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { - WARN("Stream is not being captured by the expected graph."); - return ncclInvalidUsage; + // Get current nodes from work stream so we can add them as dependencies. + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. + cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count)); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1)); + } + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + } + } + + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bNodes, bCount); } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(a->scratchEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, - b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies - )); - } - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); - #endif +ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { + CUDACHECK(cudaEventRecord(scratchEvent, b)); + CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); #endif - CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); + CUDACHECK(cudaStreamSynchronize(ss->liveStream)); return ncclSuccess; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc deleted file mode 100644 index 267e12a03..000000000 --- a/src/misc/tuner.cc +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "checks.h" -#include "debug.h" -#include "tuner.h" - -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int tunerPluginRefCount; -static void* tunerPluginLib = nullptr; -static ncclTuner_v4_t* tunerSymbol = nullptr; -static ncclTuner_v3_t* ncclTuner_v3 = nullptr; -static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v4_t ncclTuner_v2_as_v4; -static ncclTuner_v4_t ncclTuner_v3_as_v4; - -static int hasNvlsSupport(float** collCostTable) { - // Requirements for support of different algorithms: - // - // - NVLS intra-node: nvlsSupport - // - NVLS intra+inter-node: collNetSupport - // - NVLSTree intra-node: always disabled - // - NVLSTree inter-node: nvlsSupport - // - Collnet* inter-node: collNetSupport - // - // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; -} - -static int hasCollNetSupport(float** collCostTable) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; -} - -static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { - NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; - ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { - int algorithm = NCCL_ALGO_UNDEF; - int protocol = NCCL_PROTO_UNDEF; - int nvlsSupport = hasNvlsSupport(collCostTable); - int collNetSupport = hasCollNetSupport(collCostTable); - NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); - // set time to 0 below to make sure this algorithm/protocol is selected later on - if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; - } - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; - ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; - return ncclSuccess; -} - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(const char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openTunerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char tunerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); - if (envTunerPluginName && strlen(envTunerPluginName)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); - snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - // Users are allowed to pack tuner into the net plugin - snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - tunerPluginLibName[0] = '\0'; - return nullptr; -} - -enum { - tunerPluginLoadFailed = -1, - tunerPluginLoadReady = 0, - tunerPluginLoadSuccess = 1, -}; - -#define MAX_PLUGIN_LOAD 4 - -static int status = tunerPluginLoadReady; - -ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { - // Initialize to nullptr by default 
if plugin tuner cannot be loaded. - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - comm->tuner = nullptr; - if (tunerPluginLoadFailed == status) { - return ncclSuccess; - } - - pthread_mutex_lock(&tunerPluginLock); - if (tunerPluginLoadFailed == status) { - goto exit; - } - - if (tunerPluginLoadSuccess == status) { - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - goto exit; - } - - tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (nullptr == tunerPluginLib) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); - } - goto fail; - } - - tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); - if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); - ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); - if (ncclTuner_v3 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; - } else { - ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v4; - } - } else { - ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - tunerSymbol = &ncclTuner_v3_as_v4; - } - } - - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - status = tunerPluginLoadSuccess; - comm->tunerPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -fail: - tunerPluginLib = nullptr; - status = tunerPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); - if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - dlclose(tunerPluginLib); - tunerPluginLib = nullptr; - tunerSymbol = nullptr; - comm->tuner = nullptr; - status = tunerPluginLoadReady; - comm->tunerPluginLoaded = 0; - } - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 8a6f94e24..f3ab5344f 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -66,6 +66,7 @@ typedef struct ncclConfig_v21700 { int maxCTAs; const char *netName; int splitShare; + int trafficClass; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -79,7 +80,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. 
*/ diff --git a/src/net.cc b/src/net.cc deleted file mode 100644 index 13e8c2b51..000000000 --- a/src/net.cc +++ /dev/null @@ -1,1033 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "net.h" -#include "bootstrap.h" -#include "checks.h" - -#include -#include -#include -//#include -//#include -//#include - -static ncclNet_v9_t ncclNet_v5_as_v9; -static ncclNet_v9_t ncclNet_v6_as_v9; -static ncclNet_v9_t ncclNet_v7_as_v9; -static ncclNet_v9_t ncclNet_v8_as_v9; -static ncclNet_v5_t *ncclNet_v5; -static ncclNet_v6_t *ncclNet_v6; -static ncclNet_v7_t *ncclNet_v7; -static ncclNet_v8_t *ncclNet_v8; -static ncclCollNet_v9_t ncclCollNet_v5_as_v9; -static ncclCollNet_v9_t ncclCollNet_v6_as_v9; -static ncclCollNet_v9_t ncclCollNet_v7_as_v9; -static ncclCollNet_v9_t ncclCollNet_v8_as_v9; -static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclCollNet_v7_t *ncclCollNet_v7; -static ncclCollNet_v8_t *ncclCollNet_v8; - -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. -#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried - -static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = p8.netDeviceType; - props->netDeviceVersion = p8.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v8->init(logfn)); - ncclNet_v8_as_v9.name = ncclNet_v8->name; - ncclNet_v8_as_v9.devices = ncclNet_v8->devices; - ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; - ncclNet_v8_as_v9.listen = ncclNet_v8->listen; - ncclNet_v8_as_v9.connect = ncclNet_v8->connect; - ncclNet_v8_as_v9.accept = ncclNet_v8->accept; - ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; - 
ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; - ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; - ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; - ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; - ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; - ncclNet_v8_as_v9.test = ncclNet_v8->test; - ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; - ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; - ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; - ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; - ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; - ncclNet_v8_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = p7.netDeviceType; - props->netDeviceVersion = p7.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v9.name = ncclNet_v7->name; - ncclNet_v7_as_v9.devices = ncclNet_v7->devices; - ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v9.listen = ncclNet_v7->listen; - ncclNet_v7_as_v9.connect = ncclNet_v7->connect; - ncclNet_v7_as_v9.accept = ncclNet_v7->accept; - ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; - ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; - ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; - ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v9.test = ncclNet_v7->test; - ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; - 
ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; - ncclNet_v7_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v6->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v6->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v9.name = ncclNet_v6->name; - ncclNet_v6_as_v9.devices = ncclNet_v6->devices; - ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; - ncclNet_v6_as_v9.listen = ncclNet_v6->listen; - ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; - ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; - ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; - ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; - ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; - ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v9.test = ncclNet_v6->test; - ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v9.getDeviceMr = NULL; - ncclNet_v6_as_v9.irecvConsumed = NULL; - ncclNet_v6_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* 
props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v5->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v5->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v9.name = ncclNet_v5->name; - ncclNet_v5_as_v9.devices = ncclNet_v5->devices; - ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; - ncclNet_v5_as_v9.listen = ncclNet_v5->listen; - ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; - ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; - ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; - ncclNet_v5_as_v9.regMrDmaBuf = NULL; - ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; - ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; - ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v9.test = ncclNet_v5->test; - ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v9.getDeviceMr = NULL; - ncclNet_v5_as_v9.irecvConsumed = NULL; - ncclNet_v5_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; - ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; - ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; - ncclCollNet_v5_as_v9.iallgather = nullptr; - ncclCollNet_v5_as_v9.ireducescatter = nullptr; - ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v6 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; - ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; - ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; - ncclCollNet_v6_as_v9.iallgather = nullptr; - ncclCollNet_v6_as_v9.ireducescatter = nullptr; - ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v7 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; - ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; - ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; - ncclCollNet_v7_as_v9.iallgather = nullptr; - ncclCollNet_v7_as_v9.ireducescatter = nullptr; - ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request) { - ncclNetSGE_v8_t recvPartsInt; - if (nRecvParts > 1) return ncclInternalError; - if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - recvPartsInt.mhandle = recvParts->mhandle; - recvPartsInt.address = recvParts->address; - recvPartsInt.size = (int)recvParts->size; - ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, - bytesPerRank, windowOffset, windowBytes, - sendMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request) { - ncclNetSGE_v8_t sendPartsInt; - if (nSendParts > 1) return ncclInternalError; - if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - sendPartsInt.mhandle = 
sendParts->mhandle; - sendPartsInt.address = sendParts->address; - sendPartsInt.size = (int)sendParts->size; - ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, - recvData, bytesPerRank, windowOffset, windowBytes, - dataType, redOp, - recvMhandle, request); - return ans; -} - -// We use a wrapper around the v8 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v8->init(logfn)); - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; - ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; - ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; - ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; - ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; - ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; - ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; - ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; - ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; - ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; - ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; - ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; - ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; - ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; - ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; - return ncclSuccess; -} - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); - return nameList; -} - -static void* openNetPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char netPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - - snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } else { - snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } - return nullptr; -} - -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int netPluginRefCount; -static void* netPluginLib; - -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; - -#define MAX_PLUGIN_LOAD 2 - -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; - } - - netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (netPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. 
Using internal network plugin.", couldNotFindNames); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); - } - goto fail; - } - - ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); - if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); - ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); - if (ncclNet_v8 == nullptr) { - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; - } else { - ncclNets[0] = &ncclNet_v5_as_v9; - ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v9.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v6_as_v9; - ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v9.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v7_as_v9; - ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v7_as_v9.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v8_as_v9; - ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v8_as_v9.name = ncclNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); - } - - // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); - if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); - ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); - if (ncclCollNet_v8 == nullptr) { - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); - } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v9; - ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v9; - ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v9; - ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v8_as_v9; - ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); - } - - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -fail: - if (netPluginLib) dlclose(netPluginLib); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - dlclose(netPluginLib); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; - } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -} - -ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { - ncclNetProperties_t props; - - NCCLCHECK(net->getProperties(dev, &props)); - ncclNetDeviceType type = props.netDeviceType; - if (type) switch (type) { - case NCCL_NET_DEVICE_UNPACK: - if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { - INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", - props.netDeviceVersion); - return ncclSuccess; - } else { - WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", - props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); - return ncclInternalError; - } - default: - WARN("Unknown device code index %d \n", type); - return ncclInternalError; - } - - return ncclSuccess; -} - -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; - } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -static ncclResult_t 
collNetGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclCollNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; - else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; - else ncclCollNetStates[i] = ncclNetStateEnabled; - } - *state = ncclCollNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; - bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { - if (ncclNets[i] == nullptr) continue; - enum ncclNetState state; - NCCLCHECK(netGetState(i, &state)); - if (state != ncclNetStateEnabled) continue; - if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; - if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { - // Mismatched device plugin version - continue; - } - - comm->ncclNet = ncclNets[i]; - ok = true; - - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; - } - } - break; - } - - if (!ok) { - WARN("Error: network %s not found.", netName ? netName : ""); - return ncclInvalidUsage; - } - return ncclSuccess; -} - -ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; - comm->ncclCollNet = nullptr; - return ncclSuccess; -} - -ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { - constexpr int GPU_BUF_SIZE = 2*1024*1024; -#if CUDART_VERSION >= 11030 - // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute - int driverVersion; - CUDACHECK(cudaDriverGetVersion(&driverVersion)); - if (driverVersion >= 11030) { - int cudaDev, attr = 0; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); - *gdrSupport = attr; - return ncclSuccess; - } -#endif - static int gdrSupportMatrix[32] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - if (gdrSupportMatrix[comm->cudaDev] == -1) { - int netDevs; - NCCLCHECK(comm->ncclNet->devices(&netDevs)); - gdrSupportMatrix[comm->cudaDev] = 0; - for (int dev=0; devncclNet->getProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; - - // Allocate memory on the GPU and try to register it on the NIC. 
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - char* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); - - bool connected; - connected = false; - while (!connected) { - - // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { - goto cleanup2; - } - - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); - - if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - - connected = (rComm != NULL) && (sComm != NULL); - } - - NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); - gdrSupportMatrix[comm->cudaDev] = 1; - } - ncclDebugNoWarn = 0; - NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } - } - *gdrSupport = gdrSupportMatrix[comm->cudaDev]; - return ncclSuccess; -} - -int ncclNetVersion(struct ncclComm* comm) { - return - (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : - (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : - 9; -} diff --git a/src/plugin/net.cc b/src/plugin/net.cc new file mode 100644 index 000000000..9257d7786 --- /dev/null +++ b/src/plugin/net.cc @@ -0,0 +1,319 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "net.h" +#include "bootstrap.h" +#include "checks.h" +#include "plugin.h" + +#include +#include +//#include +//#include +//#include + +extern ncclNet_t* getNcclNet_v6(void* netPluginLib); +extern ncclNet_t* getNcclNet_v7(void* netPluginLib); +extern ncclNet_t* getNcclNet_v8(void* netPluginLib); +extern ncclNet_t* getNcclNet_v9(void* netPluginLib); +extern ncclNet_t* getNcclNet_v10(void* netPluginLib); + +extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; +static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; +ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; +enum ncclNetState { + ncclNetStateInit = 0, + ncclNetStateEnabled = 1, + ncclNetStateDisabled = 2 +}; +enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; + +NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); +static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; +static void* netPluginLib; + +static int netPluginRefCount; +static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();} + +enum { + netPluginLoadFailed = -1, + netPluginLoadReady = 0, + netPluginLoadSuccess = 1, +}; + +static int netPluginStatus = netPluginLoadReady; + +ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { + static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT; + pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce); + + pthread_mutex_lock(&netPluginLock); + if (netPluginLoadFailed == netPluginStatus) { + goto exit; + } + if (netPluginLoadSuccess == netPluginStatus) { + ++netPluginRefCount; + goto exit; + } + + netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN")); + if (netPluginLib == nullptr) { + goto fail; + } + + ncclNets[0] = getNcclNet_v10(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 10; + if (ncclNets[0] == nullptr) { + // Try v9 plugin + ncclNets[0] = getNcclNet_v9(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 9; + } + if (ncclNets[0] == nullptr) { + // Try v8 plugin + ncclNets[0] = getNcclNet_v8(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 8; + } + if (ncclNets[0] == nullptr) { + // Try v7 plugin + ncclNets[0] = getNcclNet_v7(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 7; + } + if (ncclNets[0] == nullptr) { + // Try v6 plugin + ncclNets[0] = getNcclNet_v6(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 6; + } + if (ncclNets[0] == nullptr) { + goto fail; + } + + // Check for CollNet + ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v8(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v7(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = 
getNcclCollNet_v6(netPluginLib); + } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + } + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index %d \n", type); + return ncclInternalError; + } + + return ncclSuccess; +} + +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + // Initialize main communication network + const char* netName; + bool ok = false; + + netName = comm->config.netName; + for (int i=0; i<3; i++) { + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } + + comm->ncclNet = ncclNets[i]; + comm->ncclNetVer = 
ncclNetsVer[i];
+    ok = true;
+
+    if (ncclCollNets[i]) {
+      NCCLCHECK(collNetGetState(i, &state));
+      if (state == ncclNetStateEnabled) {
+        comm->ncclCollNet = ncclCollNets[i];
+      }
+    }
+    break;
+  }
+
+  if (!ok) {
+    WARN("Error: network %s not found.", netName ? netName : "");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
+  comm->ncclNet = nullptr;
+  comm->ncclCollNet = nullptr;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
+  constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+  int driverVersion;
+  CUDACHECK(cudaDriverGetVersion(&driverVersion));
+  if (driverVersion >= 11030) {
+    int cudaDev, attr = 0;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+    *gdrSupport = attr;
+    return ncclSuccess;
+  }
+#endif
+  static int gdrSupportMatrix[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  if (gdrSupportMatrix[comm->cudaDev] == -1) {
+    int netDevs;
+    NCCLCHECK(comm->ncclNet->devices(&netDevs));
+    gdrSupportMatrix[comm->cudaDev] = 0;
+    for (int dev=0; dev<netDevs; dev++) {
+      ncclNetProperties_t props;
+      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
+      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+      // Allocate memory on the GPU and try to register it on the NIC.
+      void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+      ncclNetHandle_t handle;
+      char* gpuPtr = NULL;
+      void* mHandle = NULL;
+      ncclResult_t ret;
+      ncclDebugNoWarn = NCCL_NET;
+      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
+
+      bool connected;
+      connected = false;
+      while (!connected) {
+
+        // If we're aborting now, skip to cleanup
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
+          goto cleanup2;
+        }
+
+        if (sComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
+
+        if (rComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
+
+        connected = (rComm != NULL) && (sComm != NULL);
+      }
+
+      NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
+      if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+        NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
+        NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+        NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+        gdrSupportMatrix[comm->cudaDev] = 1;
+      }
+      ncclDebugNoWarn = 0;
+      NCCLCHECK(ncclCudaFree(gpuPtr));
+cleanup2:
+      if (rComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeRecv(rComm));
+      if (sComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeSend(sComm));
+      NCCLCHECK(comm->ncclNet->closeListen(lComm));
+cleanup1:
+      break;
+    }
+  }
+  *gdrSupport = gdrSupportMatrix[comm->cudaDev];
+  return ncclSuccess;
+}
diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc
new file mode 100644
index 000000000..682f239f7
--- /dev/null
+++ b/src/plugin/net/net_v10.cc
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" + +static ncclNet_v10_t* ncclNet_v10; +static ncclCollNet_v10_t* ncclCollNet_v10; + +ncclNet_t* getNcclNet_v10(void* lib) { + ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); + if (ncclNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); + return ncclNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v10(void* lib) { + ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); + if (ncclCollNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); + return ncclCollNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc new file mode 100644 index 000000000..baff67935 --- /dev/null +++ b/src/plugin/net/net_v6.cc @@ -0,0 +1,178 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v6_t* ncclNet_v6; +static ncclCollNet_v6_t* ncclCollNet_v6; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + return ncclNet_v6->connect(dev, handle, sendComm); +} + +static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + return ncclNet_v6->accept(listenComm, recvComm); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** 
request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+    sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet.devices = ncclNet_v6->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v6->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_accept;
+  ncclNet.regMr = ncclNet_regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v6->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v6->iflush;
+  ncclNet.test = ncclNet_v6->test;
+  ncclNet.closeSend = ncclNet_v6->closeSend;
+  ncclNet.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet.closeListen = ncclNet_v6->closeListen;
+  ncclNet.getDeviceMr = NULL;
+  ncclNet.irecvConsumed = NULL;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v6(void* lib) {
+  ncclNet_v6 = (ncclNet_v6_t*)dlsym(lib, "ncclNetPlugin_v6");
+  if (ncclNet_v6) {
+    ncclNet.name = ncclNet_v6->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v6->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v6->listen;
+  ncclCollNet.connect = ncclCollNet_v6->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet.test = ncclCollNet_v6->test;
+  ncclCollNet.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v6->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v6(void* lib) {
+  ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(lib, "ncclCollNetPlugin_v6");
+  if (ncclCollNet_v6) {
+    ncclCollNet.name = ncclCollNet_v6->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
new file mode 100644
index 000000000..4bad5ec26
--- /dev/null
+++ b/src/plugin/net/net_v7.cc
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v7_t* ncclNet_v7;
+static ncclCollNet_v7_t* ncclCollNet_v7;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = p7.netDeviceType;
+  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+
sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v7_t p7; + ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); + if (ans != ncclSuccess) return ans; + props->name = p7.name; + props->pciPath = p7.pciPath; + props->guid = p7.guid; + props->ptrSupport = p7.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p7.speed; + props->port = p7.port; + props->maxComms = p7.maxComms; + props->maxRecvs = p7.maxRecvs; + props->latency = p7.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v7->init(logfn)); + ncclNet.devices = ncclNet_v7->devices; + ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; + ncclNet.listen = ncclNet_v7->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v7->accept; + ncclNet.regMr = ncclNet_regMr; + ncclNet.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v7->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v7->iflush; + ncclNet.test = ncclNet_v7->test; + ncclNet.closeSend = ncclNet_v7->closeSend; + ncclNet.closeRecv = ncclNet_v7->closeRecv; + ncclNet.closeListen = ncclNet_v7->closeListen; + ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v7(void* lib) { + ncclNet_v7 = (ncclNet_v7_t*)dlsym(lib, "ncclNetPlugin_v7"); + if (ncclNet_v7) { + ncclNet.name = ncclNet_v7->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v7->init(logfn)); + ncclCollNet.devices = ncclCollNet_v7->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.connect = ncclCollNet_v7->connect; + ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet.regMr = ncclCollNet_regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + 
ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet.test = ncclCollNet_v7->test;
+  ncclCollNet.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v7(void* lib) {
+  ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(lib, "ncclCollNetPlugin_v7");
+  if (ncclCollNet_v7) {
+    ncclCollNet.name = ncclCollNet_v7->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
new file mode 100644
index 000000000..b43bb895e
--- /dev/null
+++ b/src/plugin/net/net_v8.cc
@@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v8_t* ncclNet_v8;
+static ncclCollNet_v8_t* ncclCollNet_v8;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+
props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + ncclNetSGE_v8_t recvPartsInt; + if (nRecvParts > 1) return ncclInternalError; + if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + recvPartsInt.mhandle = recvParts->mhandle; + recvPartsInt.address = recvParts->address; + recvPartsInt.size = (int)recvParts->size; + ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, + bytesPerRank, windowOffset, windowBytes, + sendMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + ncclNetSGE_v8_t sendPartsInt; + if (nSendParts > 1) return ncclInternalError; + if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + sendPartsInt.mhandle = sendParts->mhandle; + sendPartsInt.address = sendParts->address; + sendPartsInt.size = (int)sendParts->size; + ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, + recvData, bytesPerRank, windowOffset, windowBytes, + dataType, redOp, + recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v8->init(logfn)); + ncclNet.devices = ncclNet_v8->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v8->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v8->accept; + ncclNet.regMr = ncclNet_v8->regMr; + ncclNet.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v8->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v8->iflush; + ncclNet.test = ncclNet_v8->test; + ncclNet.closeSend = ncclNet_v8->closeSend; + ncclNet.closeRecv = ncclNet_v8->closeRecv; + ncclNet.closeListen = ncclNet_v8->closeListen; + ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v8(void* lib) { + ncclNet_v8 = (ncclNet_v8_t*)dlsym(lib, "ncclNetPlugin_v8"); + if (ncclNet_v8) { + ncclNet.name = ncclNet_v8->name; + ncclNet.init 
= ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v8->init(logfn)); + ncclCollNet.devices = ncclCollNet_v8->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v8->listen; + ncclCollNet.connect = ncclCollNet_v8->connect; + ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v8->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v8->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v8->iflush; + ncclCollNet.test = ncclCollNet_v8->test; + ncclCollNet.closeColl = ncclCollNet_v8->closeColl; + ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v8(void* lib) { + ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(lib, "ncclCollNetPlugin_v8"); + if (ncclCollNet_v8) { + ncclCollNet.name = ncclCollNet_v8->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc new file mode 100644 index 000000000..34e039332 --- /dev/null +++ b/src/plugin/net/net_v9.cc @@ -0,0 +1,121 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v9_t* ncclNet_v9; +static ncclCollNet_v9_t* ncclCollNet_v9; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request); +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { + return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + return ncclCollNet_v9->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v9_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v9->init(logfn)); + ncclNet.devices = ncclNet_v9->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v9->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v9->accept; + ncclNet.regMr = ncclNet_v9->regMr; + ncclNet.regMrDmaBuf = ncclNet_v9->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v9->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v9->iflush; + ncclNet.test = ncclNet_v9->test; + ncclNet.closeSend = ncclNet_v9->closeSend; + ncclNet.closeRecv = ncclNet_v9->closeRecv; + ncclNet.closeListen = ncclNet_v9->closeListen; + ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; + ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? 
ncclNet_makeVDevice : nullptr; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v9(void* lib) { + ncclNet_v9 = (ncclNet_v9_t*)dlsym(lib, "ncclNetPlugin_v9"); + if (ncclNet_v9) { + ncclNet.name = ncclNet_v9->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v9->init(logfn)); + ncclCollNet.devices = ncclCollNet_v9->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v9->listen; + ncclCollNet.connect = ncclCollNet_v9->connect; + ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v9->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v9->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v9->deregMr; + ncclCollNet.iallreduce = ncclCollNet_v9->iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v9->iflush; + ncclCollNet.test = ncclCollNet_v9->test; + ncclCollNet.closeColl = ncclCollNet_v9->closeColl; + ncclCollNet.closeListen = ncclCollNet_v9->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v9(void* lib) { + ncclCollNet_v9 = (ncclCollNet_v9_t*)dlsym(lib, "ncclCollNetPlugin_v9"); + if (ncclCollNet_v9) { + ncclCollNet.name = ncclCollNet_v9->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); + return nullptr; +} diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc new file mode 100644 index 000000000..a43df28d3 --- /dev/null +++ b/src/plugin/plugin_open.cc @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include + +#include "debug.h" + +#define MAX_STR_LEN 255 + +enum ncclPluginType { + ncclPluginTypeNet, + ncclPluginTypeTuner, + ncclPluginTypeProfiler, +}; + +#define NUM_LIBS 3 +static void *libHandles[NUM_LIBS]; +static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; +static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; +static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" }; +static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT }; + +static void* tryOpenLib(char* name, int* err, char* errStr) { + *err = 0; + if (nullptr == name || strlen(name) == 0) { + return nullptr; + } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + + void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (nullptr == handle) { + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + // "handle" and "name" won't be NULL at the same time. 
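For reference, the lookup performed by openPluginLib() below reduces to a short list of candidate shared-object names derived from the corresponding environment variable (NCCL_NET_PLUGIN, NCCL_TUNER_PLUGIN or NCCL_PROFILER_PLUGIN). The stand-alone sketch below is illustrative only and is not part of this patch; the helper name printNetPluginCandidates and the printf output are made up for the example.

#include <stdio.h>

/* Print the shared-object names tried for the NET plugin, in order,
 * mirroring the candidate list built by openPluginLib() (sketch only). */
static void printNetPluginCandidates(const char* env) {
  char name[256];
  if (env && env[0]) {
    printf("1) %s\n", env);                                  /* exact name from the env var */
    snprintf(name, sizeof(name), "libnccl-net-%s.so", env);
    printf("2) %s\n", name);                                 /* prefixed form */
  } else {
    printf("1) libnccl-net.so\n");                           /* default plugin name */
  }
}

int main(void) {
  printNetPluginCandidates("mynet");   /* tries "mynet", then "libnccl-net-mynet.so" */
  return 0;
}

So with NCCL_NET_PLUGIN=mynet the loader first dlopen()s "mynet" verbatim and, failing that, "libnccl-net-mynet.so"; with the variable unset it falls back to "libnccl-net.so".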
+ // coverity[var_deref_model] + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; + } + } + return handle; +} + +static void appendNameToList(char* nameList, int *nameListLen, char* name) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; +} + +static void* openPluginLib(enum ncclPluginType type, const char* libName) { + int openErr, len = PATH_MAX; + char libName_[MAX_STR_LEN] = { 0 }; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; + char eNoEntNameList[PATH_MAX] = { 0 }; + + if (libName && strlen(libName)) { + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + } else { + snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + } + + if (strlen(eNoEntNameList)) { + INFO(subsys[type], "%s/Plugin: Could not find:%s. 
%s", pluginNames[type], eNoEntNameList, pluginFallback[type]); + } else if (strlen(pluginFallback[type])) { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]); + } + return nullptr; +} + +void* ncclOpenNetPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeNet, name); +} + +void* ncclOpenTunerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeTuner, name); +} + +void* ncclOpenProfilerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeProfiler, name); +} + +void* ncclGetNetPluginLib(void) { + return libHandles[ncclPluginTypeNet]; +} + +ncclResult_t ncclClosePluginLib(void* handle) { + for (int l=0; ltype; - eDescr_v1.parentObj = eDescr->parentObj; - eDescr_v1.rank = eDescr->rank; - switch(eDescr->type) { - case ncclProfileGroup: break; - case ncclProfileColl: { - eDescr_v1.coll.name = eDescr->coll.name; - eDescr_v1.coll.commHash = eDescr->coll.commHash; - eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; - eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); - eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; - eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; - eDescr_v1.coll.count = eDescr->coll.count; - eDescr_v1.coll.root = eDescr->coll.root; - eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); - eDescr_v1.coll.op = 0; // removed in v2 - eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes; - eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; - eDescr_v1.coll.nWarps = eDescr->coll.nWarps; - eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); - eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); - } break; - case ncclProfileP2p: { - eDescr_v1.p2p.name = eDescr->p2p.name; - eDescr_v1.p2p.commHash = eDescr->p2p.commHash; - eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); - eDescr_v1.p2p.buff = eDescr->p2p.buff; - eDescr_v1.p2p.count = eDescr->p2p.count; - eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); - eDescr_v1.p2p.peer = eDescr->p2p.peer; - } break; - case ncclProfileProxyOp: { - eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; - eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; - eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; - eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; - eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; - eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; - } break; - case ncclProfileProxyStep: { - eDescr_v1.proxyStep.step = eDescr->proxyStep.step; - } break; - case ncclProfileProxyCtrl: break; - default:; - } - return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); -} - -static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { - ncclProfiler_v1->init(context, eActivationMask); - ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; - ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; - ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState; - ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize; - return ncclSuccess; -} #define MAX_STR_LEN 256 -static void* tryOpenLib(char* name, int *err, char* errStr) { - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = 0; - if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = 
ENOENT; - } - } - - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openProfilerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char profilerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - - const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN"); - if (envProfilerPluginName && strlen(envProfilerPluginName)) { - snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } else { - snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so"); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } - - return nullptr; -} - enum { profilerPluginLoadFailed = -1, profilerPluginLoadReady = 0, @@ -195,43 +33,31 @@ enum { static int profilerPluginStatus = profilerPluginLoadReady; static pid_t pid; -#define MAX_PLUGIN_LOAD 2 - static ncclResult_t ncclProfilerPluginLoad(void) { if (profilerPluginLoadFailed == profilerPluginStatus) { return ncclSuccess; } - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; pthread_mutex_lock(&profilerLock); if (profilerPluginLoadSuccess == profilerPluginStatus) { ++profilerPluginRefCount; goto exit; } - profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN")); if (profilerPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames); - } goto fail; } - ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2"); + ncclProfiler = getNcclProfiler_v3(profilerPluginLib); if (ncclProfiler == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2."); - ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1"); - if (ncclProfiler_v1 == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); - goto fail; - } else { - ncclProfiler = &ncclProfiler_v1_as_v2; - ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name; - ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1."); - } - } else { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2."); + ncclProfiler = getNcclProfiler_v2(profilerPluginLib); + } + if (ncclProfiler == NULL) { + ncclProfiler = 
getNcclProfiler_v1(profilerPluginLib); + } + if (ncclProfiler == NULL) { + goto fail; } ++profilerPluginRefCount; @@ -247,7 +73,7 @@ static ncclResult_t ncclProfilerPluginLoad(void) { pthread_mutex_unlock(&profilerLock); return ncclSuccess; fail: - if (profilerPluginLib) dlclose(profilerPluginLib); + if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginStatus = profilerPluginLoadFailed; goto exit; } @@ -256,7 +82,7 @@ static ncclResult_t ncclProfilerPluginUnload(void) { pthread_mutex_lock(&profilerLock); if (0 == (--profilerPluginRefCount)) { INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); - dlclose(profilerPluginLib); + NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; @@ -269,6 +95,11 @@ static ncclResult_t ncclProfilerPluginUnload(void) { #include "timer.h" #if ENABLE_TIMER +// These counters are used to measure profiler overheads for different part of the code +// These counters are only useful/meaningful in controlled test environments where there +// is only one thread updating each set of counters, i.e., every communicator has its +// own proxy thread and the network uses only one thread to make progress (this is true +// for net_ib plugin but might not be true for net_socket plugin). static int64_t elapsedCount; static int64_t initCount, finalizeCount; static int64_t groupStartCount, groupStopCount; @@ -324,15 +155,14 @@ static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2]; #endif -static int eActivationMask; // Set by profiler -static int eActivationMaskGroup; // Cached for current group +int ncclProfilerEventMask; // Set by profiler ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { TIME_START_EVENT(elapsed); TIME_START_EVENT(init); ncclProfilerPluginLoad(); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask); + int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask); if (err) { WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); ncclProfiler = NULL; @@ -356,9 +186,29 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { TIME_START_EVENT(groupStart); - eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { + // Check if any collective in the plan has a set event activation mask + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + int eActivationMask_ = 0; + while (ct) { + if (ct->eActivationMask) { + eActivationMask_ = ct->eActivationMask; + goto startGroup; + } + ct = ct->next; + } + // Check if any pt2pt in the plan has a set event activation mask + while (pt) { + if (pt->eActivationMask) { + eActivationMask_ = pt->eActivationMask; + goto startGroup; + } + pt = pt->next; + } + + startGroup: + if (eActivationMask_ & (ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileGroup; ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); @@ -379,52 +229,63 @@ ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStart); - if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileColl; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.coll.name = plan->comm->commName; - eDescr.coll.commHash = plan->comm->commHash; - eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; - eDescr.coll.func = ncclFuncToString(ct->func); - eDescr.coll.sendBuff = ct->sendbuff; - eDescr.coll.recvBuff = ct->recvbuff; - eDescr.coll.count = ct->count; - eDescr.coll.root = ct->root; - eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.trafficBytes = ct->trafficBytes; - eDescr.coll.nMaxChannels = ct->nMaxChannels; - eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ncclAlgoToString(ct->algorithm); - eDescr.coll.proto = ncclProtoToString(ct->protocol); - ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); - - // update collective task with group event activation mask - ct->eActivationMask = eActivationMaskGroup; - ct = ct->next; + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { + int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.name = plan->comm->commName; + eDescr.coll.commHash = plan->comm->commHash; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; + eDescr.coll.func = 
ncclFuncToString(ct->func); + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); + eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); + } } + } + // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well. + // The test for "persistent" is a workaround for graph-captured collectives. In their case this function may not be + // consistently invoked on all the ranks, which would lead to mismatched counter values and thus false-positive + // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is + // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which + // gives the consistency. + if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && + (ct->eActivationMask & ncclProfileKernelCh))) + plan->comm->seqNumber[ct->func]++; + ct = ct->next; + } + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileP2p; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.p2p.name = plan->comm->commName; - eDescr.p2p.commHash = plan->comm->commHash; - eDescr.p2p.func = ncclFuncToString(pt->func); - eDescr.p2p.buff = pt->buff; - eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); - eDescr.p2p.peer = pt->root; - ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); - - // update collective task with group event activation mask - pt->eActivationMask = eActivationMaskGroup; + int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.name = plan->comm->commName; + eDescr.p2p.commHash = plan->comm->commHash; + eDescr.p2p.func = ncclFuncToString(pt->func); + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); + eDescr.p2p.peer = pt->root; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); + } pt = pt->next; } } @@ -436,16 +297,15 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStop); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { + if (plan->groupEventHandle) { struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct) { - ncclProfiler->stopEvent(ct->eventHandle); + if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); ct = ct->next; } struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfiler->stopEvent(pt->eventHandle); 
+ if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); pt = pt->next; } } @@ -463,7 +323,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -485,7 +345,7 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -518,7 +378,7 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -536,7 +396,7 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -568,7 +428,7 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand TIME_START_EVENT(proxyCtrlStart); if (__builtin_expect(ncclProfiler != NULL, 0)) { // for proxy control events we allow profiling mode to change on a per event basis - int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); + int eActivationMaskProxy = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); if (eActivationMaskProxy & ncclProfileProxyCtrl) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyCtrl; @@ -591,6 +451,30 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { return ncclSuccess; } +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = &args->subs[s]; + if (sub->eActivationMask & ncclProfileKernelCh) { + ncclProfilerEventDescr_t eDescr = { }; + eDescr.type = ncclProfileKernelCh; + eDescr.parentObj = sub->taskEventHandle; + eDescr.kernelCh.channelId = sub->channelId; + ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr); + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = 
&args->subs[s]; + if (sub->kernelEventHandle) { + ncclProfiler->stopEvent(sub->kernelEventHandle); + } + } + return ncclSuccess; +} + ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; @@ -619,7 +503,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyCtrlRecord); - if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { + if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { ncclProfilerEventStateArgs_t args = { }; args.proxyCtrl.appendedProxyOps = appended; ncclProfiler->recordEventState(eHandle, eState, &args); @@ -632,3 +516,47 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { op->pid = pid; return ncclSuccess; } + +static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER; + +static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) { + ncclResult_t ret = ncclSuccess; + pthread_mutex_lock(&proxyProfilerConnectLock); + if (comm->profiler.initialized) goto exit; + for (int c = 0; c < MAXCHANNELS; c++) { + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.sendProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.recvProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.recvProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + } + comm->profiler.initialized = true; +exit: + pthread_mutex_unlock(&proxyProfilerConnectLock); + return ret; +} + +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) { + bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh)); + if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op); + return enabled; +} + +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; + if (type == 0) { // start + if (sub->eActivationMask & ncclProfileNetPlugin) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileNetPlugin; + eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS]; + eDescr.rank = sub->rank; + eDescr.netPlugin.id = pluginId; + eDescr.netPlugin.data = extData; + ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr); + } + } else { // stop + ncclProfiler->stopEvent(*eHandle); + } + } + return ncclSuccess; +} diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc new file mode 100644 index 000000000..139742942 --- /dev/null +++ b/src/plugin/profiler/profiler_v1.cc @@ -0,0 +1,133 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_t ncclProfiler; +static ncclProfiler_v1_t* ncclProfiler_v1; + +static uint8_t ncclStringToFunc(const char* func) { + if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather; + if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce; + if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast; + if (0 == strcmp(func, "Recv")) return ncclFuncRecv; + if (0 == strcmp(func, "Reduce")) return ncclFuncReduce; + if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter; + if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv; + return ncclFuncSend; +} + +static uint8_t ncclStringToAlgo(const char* algo) { + if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE; + if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING; + if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT; + if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN; + if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS; + if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE; + return NCCL_ALGO_PAT; +} + +static uint8_t ncclStringToProto(const char* proto) { + if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL; + if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128; + return NCCL_PROTO_SIMPLE; +} + +static uint8_t ncclStringToDatatype(const char* dt) { + if (0 == strcmp(dt, "ncclInt8")) return ncclInt8; + if (0 == strcmp(dt, "ncclInt32")) return ncclInt32; + if (0 == strcmp(dt, "ncclUint32")) return ncclUint32; + if (0 == strcmp(dt, "ncclInt64")) return ncclInt64; + if (0 == strcmp(dt, "ncclUint64")) return ncclUint64; + if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16; + if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32; +#if defined(__CUDA_BF16_TYPES_EXIST__) + if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16; +#endif + return ncclFloat64; +} + +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; + eDescr_v1.type = eDescr->type; + eDescr_v1.parentObj = eDescr->parentObj; + eDescr_v1.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v1.coll.name = eDescr->coll.name; + eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); + eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v1.coll.count = eDescr->coll.count; + eDescr_v1.coll.root = eDescr->coll.root; + eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); + eDescr_v1.coll.op = 0; // removed in v2 + eDescr_v1.coll.trafficBytes = 0; // removed in v3 + eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nWarps = eDescr->coll.nWarps; + eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); + eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); + } break; + case ncclProfileP2p: { + eDescr_v1.p2p.name = eDescr->p2p.name; + eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); + eDescr_v1.p2p.buff = eDescr->p2p.buff; + eDescr_v1.p2p.count = eDescr->p2p.count; + eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); + eDescr_v1.p2p.peer 
= eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v1.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: + case ncclProfileNetPlugin: { + *eHandle = NULL; + return ncclSuccess; + } + default:; + } + return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { + NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v1->finalize; + return ncclSuccess; +} + +ncclProfiler_t* getNcclProfiler_v1(void* lib) { + ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(lib, "ncclProfiler_v1"); + if (ncclProfiler_v1) { + ncclProfiler.name = ncclProfiler_v1->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name); + return &ncclProfiler; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); + return NULL; +} diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc new file mode 100644 index 000000000..3d00008a6 --- /dev/null +++ b/src/plugin/profiler/profiler_v2.cc @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_t ncclProfiler; +static ncclProfiler_v2_t* ncclProfiler_v2; + +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) { + *eHandle = NULL; + return ncclSuccess; + } + return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { + NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v2->finalize; + return ncclSuccess; +} + +ncclProfiler_t* getNcclProfiler_v2(void* lib) { + ncclProfiler_v2 = (ncclProfiler_v2_t*)dlsym(lib, "ncclProfiler_v2"); + if (ncclProfiler_v2) { + ncclProfiler.name = ncclProfiler_v2->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name); + return &ncclProfiler; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2"); + return NULL; +} diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc new file mode 100644 index 000000000..322bea57a --- /dev/null +++ b/src/plugin/profiler/profiler_v3.cc @@ -0,0 +1,20 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" + +static ncclProfiler_v3_t* ncclProfiler_v3; + +ncclProfiler_t* getNcclProfiler_v3(void* lib) { + ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3"); + if (ncclProfiler_v3) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); + return ncclProfiler_v3; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); + return NULL; +} diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc new file mode 100644 index 000000000..443bf78c4 --- /dev/null +++ b/src/plugin/tuner.cc @@ -0,0 +1,99 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include + +#include "checks.h" +#include "debug.h" +#include "tuner.h" +#include "plugin.h" + +extern ncclTuner_t* getNcclTuner_v2(void* lib); +extern ncclTuner_t* getNcclTuner_v3(void* lib); +extern ncclTuner_t* getNcclTuner_v4(void* lib); + +pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; +static int tunerPluginRefCount; +static void* tunerPluginLib = nullptr; +static ncclTuner_t* tunerSymbol = nullptr; + +enum { + tunerPluginLoadFailed = -1, + tunerPluginLoadReady = 0, + tunerPluginLoadSuccess = 1, +}; + +#define MAX_PLUGIN_LOAD 4 + +static int status = tunerPluginLoadReady; + +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { + // Initialize to nullptr by default if plugin tuner cannot be loaded. + comm->tuner = nullptr; + if (tunerPluginLoadFailed == status) { + return ncclSuccess; + } + + pthread_mutex_lock(&tunerPluginLock); + if (tunerPluginLoadFailed == status) { + goto exit; + } + + if (tunerPluginLoadSuccess == status) { + comm->tuner = tunerSymbol; + ++tunerPluginRefCount; + goto exit; + } + + tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); + if (nullptr == tunerPluginLib) { + tunerPluginLib = ncclGetNetPluginLib(); + if (nullptr == tunerPluginLib) { + goto fail; + } + } + + tunerSymbol = getNcclTuner_v4(tunerPluginLib); + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v3(tunerPluginLib); + } + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v2(tunerPluginLib); + } + if (tunerSymbol == NULL) { + goto fail; + } + + comm->tuner = tunerSymbol; + ++tunerPluginRefCount; + status = tunerPluginLoadSuccess; + comm->tunerPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +fail: + tunerPluginLib = nullptr; + status = tunerPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&tunerPluginLock); + if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { + INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); + NCCLCHECK(ncclClosePluginLib(tunerPluginLib)); + tunerPluginLib = nullptr; + tunerSymbol = nullptr; + comm->tuner = nullptr; + status = tunerPluginLoadReady; + comm->tunerPluginLoaded = 0; + } + pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +} diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc new file mode 100644 index 000000000..005638f01 --- /dev/null +++ b/src/plugin/tuner/tuner_v2.cc @@ -0,0 +1,66 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "checks.h" +#include "nccl_tuner.h" + +static ncclTuner_v2_t* ncclTuner_v2; +static ncclTuner_t ncclTuner; + +static int hasNvlsSupport(float** collCostTable) { + // Requirements for support of different algorithms: + // + // - NVLS intra-node: nvlsSupport + // - NVLS intra+inter-node: collNetSupport + // - NVLSTree intra-node: always disabled + // - NVLSTree inter-node: nvlsSupport + // - Collnet* inter-node: collNetSupport + // + // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; +} + +static int hasCollNetSupport(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1; +} + +static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { + int algorithm = NCCL_ALGO_UNDEF; + int protocol = NCCL_PROTO_UNDEF; + int nvlsSupport = hasNvlsSupport(collCostTable); + int collNetSupport = hasCollNetSupport(collCostTable); + NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); + // set time to 0 below to make sure this algorithm/protocol is selected later on + if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; + } + return ncclSuccess; +} + +static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_getCollInfo; + ncclTuner.destroy = ncclTuner_v2->destroy; + return ncclSuccess; +} + +ncclTuner_t* getNcclTuner_v2(void* lib) { + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2) { + ncclTuner.name = ncclTuner_v2->name; + ncclTuner.init = ncclTuner_init; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name); + return &ncclTuner; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + return NULL; +} diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc new file mode 100644 index 000000000..3898243bc --- /dev/null +++ b/src/plugin/tuner/tuner_v3.cc @@ -0,0 +1,38 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "checks.h" +#include "nccl_tuner.h" + +static ncclTuner_v3_t* ncclTuner_v3; +static ncclTuner_t ncclTuner; + +static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { + NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); + return ncclSuccess; +} + +static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { + NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_getCollInfo; + ncclTuner.destroy = ncclTuner_v3->destroy; + return ncclSuccess; +} + +ncclTuner_t* getNcclTuner_v3(void* lib) { + ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3"); + if (ncclTuner_v3) { + ncclTuner.name = ncclTuner_v3->name; + ncclTuner.init = ncclTuner_init; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); + return &ncclTuner; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + return NULL; +} diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc new file mode 100644 index 000000000..4bfd116bb --- /dev/null +++ b/src/plugin/tuner/tuner_v4.cc @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "nccl_tuner.h" + +static ncclTuner_v4_t* ncclTuner_v4; + +ncclTuner_t* getNcclTuner_v4(void* lib) { + ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); + if (ncclTuner_v4) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); + return ncclTuner_v4; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index 5a83ef3eb..7e8021e47 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -383,6 +383,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->pid = op->pid; sub->profilerContext = op->profilerContext; sub->ringAlgo = op->ringAlgo; + sub->workCounter = op->workCounter; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -532,6 +533,19 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon return ncclSuccess; } +static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { + struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? 
&comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId]; + if (justInquire) *justInquire = true; + else { + op->sendbuff = (uint8_t *)comm->profiler.workStarted; + op->recvbuff = (uint8_t *)comm->profiler.workCompleted; + NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); + // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter + op->workCounter += comm->profiler.workCounter[op->channelId]; + } + return ncclSuccess; +} + static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; @@ -612,20 +626,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool // Run full algorithm to count the number of steps for each peer. ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; inbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; iroot == comm->rank) return ncclSuccess; NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; + case ncclPatternProfiler: { + if (ncclProfilerNeedsProxy(comm, op)) { + NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); + } + } break; } return ncclSuccess; } @@ -725,10 +742,10 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); - NCCLCHECK(op->progress(proxyState, op)); + ncclResult_t ret = op->progress(proxyState, op); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; - if (op->state == ncclProxyOpNone) { + if (op->state == ncclProxyOpNone || ret != ncclSuccess) { TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -910,7 +927,7 @@ void* ncclProxyProgress(void *proxyState_) { if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); - continue; + break; } void* eHandle; ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); @@ -932,7 +949,7 @@ void* ncclProxyProgress(void *proxyState_) { } } lastIdle = idle; - } while (state->stop == 0 || (state->stop == 1 && state->active)); + } while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0); return NULL; } @@ -1140,6 +1157,7 @@ ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyCon } ncclIpcHdr hdr; + memset(&hdr, '\0', sizeof(hdr)); hdr.type = type; hdr.rank = rank; hdr.reqSize = reqSize; @@ -1323,9 +1341,12 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { pthread_mutexattr_init(&mutexAttr); pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&pool->mutex, &mutexAttr); + pthread_mutexattr_destroy(&mutexAttr); pthread_condattr_t condAttr; + pthread_condattr_init(&condAttr); pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&pool->cond, &condAttr); + pthread_condattr_destroy(&condAttr); state->opsPool = pool; memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 3e4e9a504..3eafe1b79 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -4,8 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! -#include #include #include @@ -26,26 +24,26 @@ #define STR2(v) #v #define STR(v) STR2(v) -// The RAS client listening socket of this RAS thread (normally port 28028). -int rasClientListeningSocket = -1; - -// Auxiliary structure used when processing the results. Helps with statistics gathering and sorting. +// Generic auxiliary structure used when processing the results. Helps with statistics gathering and sorting, +// e.g., for the calculation of the distribution of the number of peers per node, of the number of GPUs per peer, +// of the communicator sizes, or of the counts of collective operations. struct rasValCount { uint64_t value; // The observed value. int count; // The number of occurences of this value in the results. int firstIdx; // The index of the first occurence of this value in the results. }; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined. 
+// Communicator status, used in rasAuxComm below. The values are bitmasks so that they can be combined. typedef enum { - RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator. + RAS_ACS_NOCOMM = 1, // Set if the peer claims not to be a member of a given communicator. RAS_ACS_INIT = 2, RAS_ACS_RUNNING = 4, RAS_ACS_FINALIZE = 8, RAS_ACS_ABORT = 16 } rasACStatus; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK). +// Communicator errors, used in rasAuxComm below. The values are bitmasks so that they can be combined (with the +// exception of RAS_ACE_OK). typedef enum { RAS_ACE_OK = 0, RAS_ACE_MISMATCH = 1, @@ -53,22 +51,45 @@ typedef enum { RAS_ACE_INCOMPLETE = 4 } rasACError; -// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics -// on the number of peers and nodes for a communicator. +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query. For each communicator, caches +// statistics extracted from the results, such as the number of peers and nodes or the communicator status. Includes +// a pointer to the communicator data in the results, making it easy to sort the communicators by a different key +// without altering the results buffer, or just to iterate over the communicators, given that the communicator data +// in the resuls is of variable length. struct rasAuxComm { - struct rasCollComms::comm* comm; + struct rasCollComms::comm* comm; // Points to the results buffer. int nPeers; int nNodes; int ranksPerNodeMin; int ranksPerNodeMax; unsigned int status; // Bitmask of rasACStatus values. unsigned int errors; // Bitmask of rasACError values. - uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against. + uint64_t firstCollOpCounts[NCCL_NUM_FUNCTIONS]; // collOpCounts of the first rank, to compare against. + int nIncompleteRanks; // Number of ranks that we didn't get any response from. }; +// Auxiliary structure used when processing the rasPeerInfo data stored in the global rasPeers array. Makes it possible +// to extract a subset of peers (e.g., the dead ones), to sort by a different key without altering the original array, +// and also has room for extracted temporary data such as the number of peers per node or the number of GPUs per peer. +struct rasAuxPeerInfo { + struct rasPeerInfo* peer; // Points to an element in rasPeers. + int value; +}; + +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query, specifically when iterating over +// each communicator's ranks. Makes it possible to sort by a different key without altering the original array, and +// also has room for extracted temporary data such as the rank's status or a count of collective operations. +struct rasAuxCommRank { + struct rasCollComms::comm::rank* rank; // Points to the results buffer. + uint64_t value; +}; + +// The RAS client listening socket of this RAS thread (normally port 28028). +int rasClientListeningSocket = -1; + // Connected RAS clients. -struct rasClient* rasClients; -int nRasClients; +struct rasClient* rasClientsHead; +struct rasClient* rasClientsTail; // Minimum byte count to increment the output buffer size by if it's too small. 
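The rasClients bookkeeping above changes from a realloc-grown array indexed by clientIdx into an intrusive doubly-linked list anchored at rasClientsHead/rasClientsTail, so entries can be allocated individually with ncclCalloc and unlinked on termination without invalidating the other clients. A minimal standalone sketch of the append and unlink steps, using a simplified Client type rather than the real rasClient:

#include <stdio.h>
#include <stdlib.h>

// Simplified stand-in for struct rasClient; only the intrusive list links matter here.
struct Client {
  int sock;
  struct Client* prev;
  struct Client* next;
};

static struct Client* clientsHead = nullptr;
static struct Client* clientsTail = nullptr;

// Append a zero-initialized entry at the tail (the getNewClientEntry pattern).
static struct Client* clientAppend(int sock) {
  struct Client* c = (struct Client*)calloc(1, sizeof(*c));
  if (c == nullptr) return nullptr;
  c->sock = sock;
  if (clientsHead) {
    clientsTail->next = c;
    c->prev = clientsTail;
    clientsTail = c;
  } else {
    clientsHead = clientsTail = c;
  }
  return c;
}

// Unlink and free an entry (the rasClientTerminate pattern): fix up head/tail first,
// then the neighbors, then release the node.
static void clientRemove(struct Client* c) {
  if (c == clientsHead) clientsHead = clientsHead->next;
  if (c == clientsTail) clientsTail = clientsTail->prev;
  if (c->prev) c->prev->next = c->next;
  if (c->next) c->next->prev = c->prev;
  free(c);
}

int main() {
  struct Client* a = clientAppend(3);
  struct Client* b = clientAppend(4);
  clientAppend(5);
  clientRemove(b);  // remove from the middle
  clientRemove(a);  // remove the old head
  for (struct Client* c = clientsHead; c; c = c->next) printf("client with sock %d\n", c->sock);
  return 0;
}

One visible consequence in the hunks below is that clients are identified by pointer from then on: rasClientResume matches client->coll against the completed collective instead of comparing a stored collIdx.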
#define RAS_OUT_INCREMENT 4096 @@ -85,6 +106,7 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). + static ncclResult_t getNewClientEntry(struct rasClient** pClient); static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen); static void rasClientTerminate(struct rasClient* client); @@ -101,15 +123,13 @@ static void rasOutExtract(char* buffer); static int rasOutLength(); static void rasOutReset(); -static int rasPeersNGpuCompare(const void* e1, const void* e2); -static int rasPeersNProcsCompare(const void* e1, const void* e2); -static int rasPeersHostPidCompare(const void* e1, const void* e2); +static int rasAuxPeersValueCompare(const void* e1, const void* e2); static int ncclSocketsHostCompare(const void* p1, const void* p2); static int rasValCountsCompareRev(const void* p1, const void* p2); static int rasAuxCommsCompareRev(const void* p1, const void* p2); -static int rasCommRanksPeerCompare(const void* p1, const void* p2); -static int rasCommRanksCollOpCompare(const void* p1, const void* p2); +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2); +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size); static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size); static const char* ncclErrorToString(ncclResult_t err); static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size); @@ -181,21 +201,20 @@ ncclResult_t rasClientAcceptNewSocket() { // Returns the index of the first available entry in the rasClients array, enlarging the array if necessary. static ncclResult_t getNewClientEntry(struct rasClient** pClient) { struct rasClient* client; - int i; - for (i = 0; i < nRasClients; i++) - if (rasClients[i].status == RAS_CLIENT_CLOSED) - break; - if (i == nRasClients) { - NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT)); - nRasClients += RAS_INCREMENT; - } - client = rasClients+i; - memset(client, '\0', sizeof(*client)); + NCCLCHECK(ncclCalloc(&client, 1)); + client->sock = client->pfd = -1; ncclIntruQueueConstruct(&client->sendQ); client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT; - client->collIdx = -1; + + if (rasClientsHead) { + rasClientsTail->next = client; + client->prev = rasClientsTail; + rasClientsTail = client; + } else { + rasClientsHead = rasClientsTail = client; + } *pClient = client; return ncclSuccess; @@ -219,22 +238,32 @@ static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgL struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); meta->offset = 0; meta->length = msgLen; - ncclIntruQueueEnqueue(&client->sendQ, meta); - assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED); - rasPfds[client->pfd].events |= POLLOUT; + if (client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED) { + ncclIntruQueueEnqueue(&client->sendQ, meta); + rasPfds[client->pfd].events |= POLLOUT; + } else { + INFO(NCCL_RAS, "RAS invalid client status %d -- internal error?", client->status); + } } // Terminates a connection with a RAS client. 
static void rasClientTerminate(struct rasClient* client) { (void)close(client->sock); - client->sock = -1; - client->status = RAS_CLIENT_CLOSED; rasPfds[client->pfd].fd = -1; rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; - client->pfd = -1; while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { free(meta); } + + if (client == rasClientsHead) + rasClientsHead = rasClientsHead->next; + if (client == rasClientsTail) + rasClientsTail = rasClientsTail->prev; + if (client->prev) + client->prev->next = client->next; + if (client->next) + client->next->prev = client->prev; + free(client); } @@ -245,16 +274,12 @@ static void rasClientTerminate(struct rasClient* client) { // Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and // reinvokes rasClientRun. ncclResult_t rasClientResume(struct rasCollective* coll) { - int collIdx = coll-rasCollectives; - int i; - struct rasClient* client = nullptr; - for (i = 0; i < nRasClients; i++) { - client = rasClients+i; - if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + struct rasClient* client; + + for (client = rasClientsHead; client; client = client->next) + if (client->coll == coll) break; - } - } - if (i == nRasClients) { + if (client == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching client!"); rasCollFree(coll); goto exit; @@ -266,8 +291,7 @@ ncclResult_t rasClientResume(struct rasCollective* coll) { } // Handles a ready client FD from the main event loop. -void rasClientEventLoop(int clientIdx, int pollIdx) { - struct rasClient* client = rasClients+clientIdx; +void rasClientEventLoop(struct rasClient* client, int pollIdx) { bool closed = false; if (client->status == RAS_CLIENT_CONNECTED) { @@ -431,7 +455,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_CONNS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); #endif client->status = RAS_CLIENT_COMMS; @@ -440,7 +463,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_COMMS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); client->status = RAS_CLIENT_FINISHED; break; @@ -459,7 +481,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasPeerInfo* peersReSorted = nullptr; + struct rasAuxPeerInfo* auxRasPeers = nullptr; int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; int firstIdx, nPeers; @@ -467,6 +489,8 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int nValCounts; static int cudaDriver = -1, cudaRuntime = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); + rasOutReset(); rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); @@ -481,7 +505,6 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; - rasOutReset(); totalGpus = totalNodes = 0; firstNGpusNode = 0; // #GPUs on the first peer of a node. firstNGpusGlobal = 0; // #GPUs on peerIdx 0. @@ -489,7 +512,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. 
consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. nPeers = 0; // #peers on a node. - firstNPeersGlobal = 0; + firstNPeersGlobal = 0; // #peers on the first node. for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); totalGpus += nGpus; @@ -522,6 +545,11 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } } // for (peerIdx) + TRACE(NCCL_RAS, "RAS: totalNodes %d, nRasPeers %d, totalGpus %d", totalNodes, nRasPeers, totalGpus); + TRACE(NCCL_RAS, "RAS: consistentNPeersGlobal %d, consistentNGpusGlobal %d, consistentNGpusNode %d", + consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode); + TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal); + rasOutAppend("Job summary\n" "===========\n\n"); @@ -532,22 +560,24 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); } else { // Gather the stats on the number of processes per node. However, that number is not a property of a peer, - // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively - // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // but of a group of peers, so calculating it is more involved. We store the value in a temporary auxRasPeers + // array. + NCCLCHECKGOTO(ncclCalloc(&auxRasPeers, nRasPeers), ret, fail); firstIdx = 0; nPeers = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].peer = rasPeers+peerIdx; if (peerIdx == 0) { nPeers = 1; firstIdx = 0; } else { // peerIdx > 0 - if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + if (!ncclSocketsSameNode(&auxRasPeers[peerIdx].peer->addr, &auxRasPeers[peerIdx-1].peer->addr)) { + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < peerIdx; i++) { // Go back and update the number of processes of all the elements of that node. - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } nPeers = 1; firstIdx = peerIdx; @@ -557,21 +587,23 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } // peerIdx > 0 if (peerIdx == nRasPeers-1) { // Last iteration of the loop. + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < nRasPeers; i++) { - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } } } // for (peerIdx) - // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // Re-sort it now using the number of processes on the node (value) as the primary key, host IP as the // secondary, and process id as the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of peers per node. 
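The peers-per-node pass just above relies on rasPeers being sorted by address: it walks the array, and whenever ncclSocketsSameNode reports a node boundary it goes back and writes the accumulated count into every auxRasPeers entry of the node that just ended, with a final back-fill after the last element. A simplified, self-contained version of that back-fill loop, with a plain int host id standing in for the socket address:

#include <stdio.h>

int main() {
  // Peers sorted by node (rasPeers is kept sorted by address), one entry per process.
  int host[]  = { 10, 10, 10, 20, 20, 30 };
  int value[] = {  0,  0,  0,  0,  0,  0 };  // peers-per-node, filled in below
  const int n = 6;

  int firstIdx = 0, nPeers = 0;
  for (int i = 0; i < n; i++) {
    if (i == 0) {
      nPeers = 1; firstIdx = 0;
    } else if (host[i] != host[i-1]) {         // node boundary: back-fill the finished node
      for (int j = firstIdx; j < i; j++) value[j] = nPeers;
      nPeers = 1; firstIdx = i;
    } else {
      nPeers++;
    }
    if (i == n-1) {                            // last iteration: back-fill the final node
      for (int j = firstIdx; j < n; j++) value[j] = nPeers;
    }
  }
  for (int i = 0; i < n; i++)
    printf("peer %d on host %d: %d peers on that node\n", i, host[i], value[i]);
  return 0;
}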
nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers;) { - if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { - valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; @@ -579,14 +611,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { valCounts[nValCounts-1].count++; } // Advance peerIdx to the next node. - peerIdx += peersReSorted[peerIdx].cudaDevs; - } + peerIdx += auxRasPeers[peerIdx].value; + } // for (peerIdx) // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent // number of peers first). qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); // Print it out, the most frequent peer counts first. if (consistentNGpusNode && consistentNGpusGlobal) { + // consistentNPeersGlobal must be false rasOutAppend(" Nodes Processes GPUs\n" " per node per process\n"); for (int i = 0; i < nValCounts; i++) { @@ -594,7 +627,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend("%7d %9ld %11d\n", vc->count, vc->value, firstNGpusGlobal); } - } else { + } else { // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend(" Nodes Processes\n" " per node\n"); for (int i = 0; i < nValCounts; i++) { @@ -606,24 +639,29 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). - // Sort peers by the GPU count, to simplify data extraction. - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // Sort peers by the GPU count, to simplify data extraction. Not sure how fast __builtin_popcountll is so we + // may just as well cache it... + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].value = __builtin_popcountll(auxRasPeers[peerIdx].peer->cudaDevs); + TRACE(NCCL_RAS, "RAS: node %s pid %d: nGpus %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), + auxRasPeers[peerIdx].peer->pid, auxRasPeers[peerIdx].value); + } // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of GPUs per peer. nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { - if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != - __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { - valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; } else { valCounts[nValCounts-1].count++; } - } + } // for (peerIdx) // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent // GPU counts first). 
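Both distribution passes above follow the same sequence that replaces the old trick of copying rasPeers and overloading cudaDevs: fill an auxiliary array of {pointer into the original data, scratch value}, qsort it by value (host address and pid as tie-breakers), run-length-count equal values into rasValCount entries, and finally sort those by count so the most frequent value prints first. A self-contained sketch of that sequence; Peer, AuxPeer and ValCount are simplified stand-ins for the RAS structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct Peer     { int pid; int nGpus; };                       // stand-in for rasPeerInfo
struct AuxPeer  { const struct Peer* peer; uint64_t value; };  // stand-in for rasAuxPeerInfo
struct ValCount { uint64_t value; int count; int firstIdx; };  // stand-in for rasValCount

// Primary key: value; tie-breaker: pid (the real comparator also uses the host address).
static int auxPeerValueCompare(const void* e1, const void* e2) {
  const struct AuxPeer* p1 = (const struct AuxPeer*)e1;
  const struct AuxPeer* p2 = (const struct AuxPeer*)e2;
  if (p1->value != p2->value) return (p1->value < p2->value) ? -1 : 1;
  return (p1->peer->pid > p2->peer->pid) - (p1->peer->pid < p2->peer->pid);
}

// Most frequent value first.
static int valCountsCompareRev(const void* e1, const void* e2) {
  return ((const struct ValCount*)e2)->count - ((const struct ValCount*)e1)->count;
}

int main() {
  constexpr int nPeers = 5;
  struct Peer peers[nPeers] = { {101, 8}, {102, 4}, {103, 8}, {104, 8}, {105, 4} };

  // Step 1: build the auxiliary array; peers[] itself is never reordered.
  struct AuxPeer aux[nPeers];
  for (int i = 0; i < nPeers; i++) { aux[i].peer = peers + i; aux[i].value = peers[i].nGpus; }
  qsort(aux, nPeers, sizeof(*aux), auxPeerValueCompare);

  // Step 2: run-length-count equal values (valid because aux[] is now sorted by value).
  struct ValCount vc[nPeers];
  int nVc = 0;
  for (int i = 0; i < nPeers; i++) {
    if (i == 0 || aux[i].value != aux[i-1].value) {
      vc[nVc].value = aux[i].value; vc[nVc].count = 1; vc[nVc].firstIdx = i; nVc++;
    } else {
      vc[nVc-1].count++;
    }
  }

  // Step 3: print the most common value first, as in the summary tables.
  qsort(vc, nVc, sizeof(*vc), valCountsCompareRev);
  for (int i = 0; i < nVc; i++)
    printf("%d processes with %llu GPUs each (first at aux index %d)\n",
           vc[i].count, (unsigned long long)vc[i].value, vc[i].firstIdx);
  return 0;
}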
qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); @@ -637,7 +675,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend(" %9d %11ld\n", vc->count, vc->value); } - } + } // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend("\n" " Nodes Processes GPUs\n" "(total) (total) (total)\n" @@ -652,16 +690,16 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // provided that they meet our definition of an outlier. if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); - // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // auxRasPeers is sorted by the node IP address (not port!) as the secondary key and the pid as // the tertiary, which comes in handy when printing... for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { lineBuf[0] = '\0'; for (int j = 0; j < vc->value; j++) { snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (j > 0 ? "," : ""), peersReSorted[j].pid); + (j > 0 ? "," : ""), auxRasPeers[j].peer->pid); } rasOutAppend(" Node %s running process%s %s\n", - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), (vc->value > 1 ? "es" : ""), lineBuf); } // for (peerIdx) } // if (rasCountIsOutlier(vc->count)) @@ -678,13 +716,12 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_CONNS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; // We need to wait for async. responses. 
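The request scheduling above is the asynchronous half of the client state machine: rasNetSendCollReq only initiates the collective, the handle is stashed in client->coll, and returning ncclInProgress parks the client until rasClientResume is invoked with the completed collective and re-enters rasClientRun. A rough sketch of that control flow; Query, Client, startQuery, runStep and resume are invented names for illustration, not the RAS API:

#include <stdio.h>

enum Result { Success, InProgress };
enum State  { StateInit, StateComms, StateFinished };

struct Query  { bool done; };                 // stand-in for an in-flight collective
struct Client { State state; Query* pending; };

// Kick off an asynchronous query; it may or may not complete immediately.
static Result startQuery(Query* q, bool completesImmediately) {
  q->done = completesImmediately;
  return q->done ? Success : InProgress;
}

// One step of the client state machine (the rasClientRun pattern).
static Result runStep(Client* c, Query* q) {
  switch (c->state) {
    case StateInit:
      if (startQuery(q, /*completesImmediately=*/false) == InProgress) {
        c->pending = q;          // remember what we are waiting for (client->coll)
        return InProgress;       // park the client; resume() will re-enter later
      }
      c->state = StateComms;
      // fall through to process the immediately available results
    case StateComms:
      printf("processing results\n");
      c->state = StateFinished;
      return Success;
    default:
      return Success;
  }
}

// Completion callback (the rasClientResume pattern): match by handle, then re-enter.
static Result resume(Client* c, Query* q) {
  if (c->pending != q) return Success;  // not the client we are looking for
  c->pending = nullptr;
  c->state = StateComms;
  return runStep(c, q);
}

int main() {
  Client client = { StateInit, nullptr };
  Query query = { false };
  if (runStep(&client, &query) == InProgress) {
    query.done = true;                  // simulate the asynchronous responses arriving
    resume(&client, &query);
  }
  return 0;
}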
} @@ -696,18 +733,18 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } + TRACE(NCCL_RAS, "RAS: rasClientRunInit: scheduling RAS_COLL_COMMS and finishing"); exit: - free(peersReSorted); + free(auxRasPeers); return ret; fail: goto exit; @@ -721,13 +758,16 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollConns* connsData = (struct rasCollConns*)coll->data; int expected; struct rasPeerInfo* peersBuf = nullptr; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9); @@ -822,13 +862,12 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } @@ -847,10 +886,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollComms* commsData = (struct rasCollComms*)coll->data; struct rasCollComms::comm* comm; - struct rasCollComms::comm::rank* ranksReSorted = nullptr; + struct rasAuxCommRank* auxCommRanks = nullptr; struct rasValCount* valCounts = nullptr; int nValCounts; struct rasValCount* collOpCounts = nullptr; @@ -860,7 +899,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { int vcIdx; int nPeersMissing; uint64_t* peerNvmlDevs = nullptr; - const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" }; + const char*const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" }; const char*const errorStr[] = { // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer. 
"OK", @@ -873,14 +912,22 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { "INCOMPLETE,ERROR,MISMATCH" }; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunComms: starting"); + TRACE(NCCL_RAS, "RAS: coll nLegTimeouts %d, nPeers %d, nData %d; commsData nComms %d", + coll->nLegTimeouts, coll->nPeers, coll->nData, commsData->nComms); + + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); // Calculate the number of missing peers early as we rely on it for other things. nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + TRACE(NCCL_RAS, "RAS: nRasPeers %d, nRasDeadPeers %d, nPeersMissing %d", nRasPeers, nRasDeadPeers, nPeersMissing); // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort @@ -896,12 +943,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComms[commIdx].comm = comm; comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); } - NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxCommRanks, maxCommSize), ret, fail); + TRACE(NCCL_RAS, "RAS: maxCommSize %d", maxCommSize); // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); - for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) { peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + TRACE(NCCL_RAS, "RAS: coll peers[%d] -> rasPeers[%d]", peerIdx, peerIdxConv[peerIdx]); + } // Sort coll->peers to match the ordering of rasPeers -- we may need it later... qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); @@ -910,42 +960,75 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { struct rasAuxComm* auxComm = auxComms+commIdx; int nRanks = 0; comm = auxComm->comm; + TRACE(NCCL_RAS, "RAS: coll comms[%d]: commId (0x%lx, 0x%lx, 0x%lx), commNRanks %d, nRanks %d, nMissingRanks %d", + commIdx, comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash, + comm->commNRanks, comm->nRanks, comm->nMissingRanks); - if (comm->commNRanks > comm->nRanks) { + if (comm->nMissingRanks > 0) { // There are two possibilities here. Either we are missing the data on some ranks because the processes are // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which - // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we - // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. - // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this - // as an INCOMPLETE error; otherwise as a MISMATCH warning. - if (nPeersMissing > 0 || nRasDeadPeers > 0) - auxComm->errors |= RAS_ACE_INCOMPLETE; - else { + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). 
+ if (nPeersMissing == 0 && nRasDeadPeers == 0) { + // We received data from _all_ processes. That's an easy case. auxComm->errors |= RAS_ACE_MISMATCH; - auxComm->status |= RAS_ACS_UNKNOWN; - } + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from some processes but we don't know if that's why we don't have the info about + // some ranks of this communicator. We need to check all the missing ranks one-by-one as different ranks may + // have different reason. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + void* found; + if ((found = bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) != nullptr) { + // We did receive the data from that process, but not about this communicator. + auxComm->errors |= RAS_ACE_MISMATCH; + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from that process. + auxComm->errors |= RAS_ACE_INCOMPLETE; + auxComm->nIncompleteRanks++; + } + TRACE(NCCL_RAS, "RAS: comm missingRank[%d] commRank %d, addr %td (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, missingRank->commRank, (found ? ((union ncclSocketAddress*)found) - coll->peers: -1), + rasPeerFind(&missingRank->addr), missingRank->cudaDev, missingRank->nvmlDev); + } // for (rankIdx) + } // nPeersMissing > 0 || nRasDeadPeers > 0 + } // if (comm->nMissingRanks > 0) + + // Initialize auxCommRanks from comm->rank, converting peerIdx to rasPeers, then sort by it -- that way we will + // have the ranks sorted by node and process, which makes counting easy. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + auxCommRanks[rankIdx].rank = rank; + auxCommRanks[rankIdx].value = peerIdxConv[rank->peerIdx]; + TRACE(NCCL_RAS, "RAS: comm rank[%d] commRank %d, peerIdx %d (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, rank->commRank, rank->peerIdx, peerIdxConv[rank->peerIdx], rank->cudaDev, rank->nvmlDev); + TRACE(NCCL_RAS, "RAS: comm rank[%d] collOpCounts (%ld, %ld, %ld, %ld, %ld)", + rankIdx, rank->collOpCounts[0], rank->collOpCounts[1], rank->collOpCounts[2], rank->collOpCounts[3], + rank->collOpCounts[4]); + TRACE(NCCL_RAS, "RAS: comm rank[%d] status initState %d, asyncError %d, finalizeCalled %d, destroyFlag %d, " + "abortFlag %d", rankIdx, rank->status.initState, rank->status.asyncError, rank->status.finalizeCalled, + rank->status.destroyFlag, rank->status.abortFlag); /**/ } - - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted - // by process _and_ node, which makes counting easy. - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) - ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare); + // This also sorts by the commRank, which we don't care about here, but it won't hurt. + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Count the peers and nodes, get the status/error indicators. 
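The missing-rank classification above works because coll->peers has been sorted with ncclSocketsCompare, so a bsearch per missing rank answers whether that peer responded at all: found means the process replied but not for this communicator (NOCOMM, reported as a MISMATCH), not found means no response at all (INCOMPLETE). A small sketch of the same classification with plain integers standing in for ncclSocketAddress and the comparator:

#include <stdio.h>
#include <stdlib.h>

static int intCompare(const void* a, const void* b) {
  int x = *(const int*)a, y = *(const int*)b;
  return (x > y) - (x < y);
}

int main() {
  // Peers that responded to the query, kept sorted so bsearch can be used
  // (coll->peers is sorted the same way with ncclSocketsCompare).
  int responders[] = { 2, 5, 7, 11 };
  const int nResponders = sizeof(responders)/sizeof(responders[0]);
  qsort(responders, nResponders, sizeof(int), intCompare);

  // Ranks for which no communicator data was reported, identified by their peer id.
  int missingRankPeer[] = { 5, 9 };
  const int nMissing = sizeof(missingRankPeer)/sizeof(missingRankPeer[0]);
  for (int i = 0; i < nMissing; i++) {
    int key = missingRankPeer[i];
    void* found = bsearch(&key, responders, nResponders, sizeof(int), intCompare);
    if (found) printf("peer %d responded but is not in this communicator (NOCOMM/MISMATCH)\n", key);
    else       printf("peer %d did not respond at all (INCOMPLETE)\n", key);
  }
  return 0;
}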
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; if (rankIdx == 0) { auxComm->nPeers = auxComm->nNodes = 1; auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; auxComm->ranksPerNodeMax = 0; - auxComm->firstCollOpCount = rank->collOpCount; + memcpy(auxComm->firstCollOpCounts, auxRank->rank->collOpCounts, sizeof(auxComm->firstCollOpCounts)); nRanks = 1; } else { // rankIdx > 0 - if (rank->peerIdx != rank[-1].peerIdx) { + if (auxRank->value != auxRank[-1].value) { auxComm->nPeers++; - if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + if (!ncclSocketsSameNode(&rasPeers[auxRank->value].addr, &rasPeers[auxRank[-1].value].addr)) { auxComm->nNodes++; if (auxComm->ranksPerNodeMin > nRanks) auxComm->ranksPerNodeMin = nRanks; @@ -953,7 +1036,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; nRanks = 0; } - } // if (rank->peerIdx != rank[-1].peerIdx) + } // if (auxRank->value != auxRank[-1].value) nRanks++; } // rankIdx > 0 if (rankIdx == comm->nRanks-1) { @@ -964,25 +1047,27 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; } - if (rank->status.abortFlag) + if (auxRank->rank->status.abortFlag) auxComm->status |= RAS_ACS_ABORT; - else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + else if (auxRank->rank->status.finalizeCalled || auxRank->rank->status.destroyFlag) { // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly // as a finalize state indicator (and ignore it in case of ncclCommAbort). auxComm->status |= RAS_ACS_FINALIZE; } - else if (rank->status.initState == ncclSuccess) + else if (auxRank->rank->status.initState == ncclSuccess) auxComm->status |= RAS_ACS_RUNNING; - else // rank->initState != ncclSuccess + else // auxRank->rank->initState != ncclSuccess auxComm->status |= RAS_ACS_INIT; - if (rank->collOpCount != auxComm->firstCollOpCount) - auxComm->errors |= RAS_ACE_MISMATCH; - if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS && !(auxComm->errors & RAS_ACE_MISMATCH); collIdx++) { + if (auxRank->rank->collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) + auxComm->errors |= RAS_ACE_MISMATCH; + } + if (auxRank->rank->status.initState != ncclSuccess && auxRank->rank->status.initState != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; - if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + if (auxRank->rank->status.asyncError != ncclSuccess && auxRank->rank->status.asyncError != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; } // for (rankIdx) @@ -990,9 +1075,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // We've got a status mismatch between ranks. 
auxComm->errors |= RAS_ACE_MISMATCH; } + TRACE(NCCL_RAS, "RAS: auxComm nPeers %d, nNodes %d, nIncompleteRanks %d", + auxComm->nPeers, auxComm->nNodes, auxComm->nIncompleteRanks); + TRACE(NCCL_RAS, "RAS: auxComm ranksPerNodeMin %d, ranksPerNodeMax %d, status 0x%x, errors 0x%x", + auxComm->ranksPerNodeMin, auxComm->ranksPerNodeMax, auxComm->status, auxComm->errors); } // for (commIdx) // Sort it by size/nNodes/status/errors/missing ranks. - qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + if (auxComms) + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); // Calculate the distribution of different communicator sizes. NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); @@ -1014,10 +1104,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } } - rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" - " # in group per comm per node per comm in group\n"); - if (commsData->nComms == 0) + TRACE(NCCL_RAS, "RAS: rasClientRunComms: done with initial data processing"); + + if (commsData->nComms > 0) { + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + } else { rasOutAppend("No communicator data collected!\n"); + } // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); @@ -1058,6 +1152,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // status (which is a bitmask) into an array index. statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); } + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("\nErrors\n" "======\n\n"); @@ -1068,12 +1167,12 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (rasCountIsOutlier(nPeersMissing, client->verbose)) { // Extract a list of missing peers. We don't want to print it right away because it would be sorted // by address (including port, which isn't meaningful to end users). - struct rasPeerInfo* peersBuf = nullptr; + struct rasAuxPeerInfo* auxPeersBuf = nullptr; int nPeersBuf; // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing // them much easier. - NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nPeersMissing), ret, fail); nPeersBuf = 0; for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { int cmp; @@ -1088,30 +1187,42 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } else if (cmp < 0) { // Process missing from coll->peers. Don't report dead ones though, as they are not included // in nPeersMissing and are reported separately below. 
- if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { - assert(nPeersBuf < nPeersMissing); - memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + bool dead; + if (!(dead = rasPeerIsDead(&rasPeers[rasPeerIdx].addr))) { + if (nPeersBuf < nPeersMissing) { + auxPeersBuf[nPeersBuf++].peer = rasPeers+rasPeerIdx; + } else { + INFO(NCCL_RAS, "RAS overflow of auxPeersBuf: nPeersBuf %d, rasPeerIdx %d (%s), collPeerIdx %d -- " + "internal error?", + nPeersBuf, rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), collPeerIdx); + } } + TRACE(NCCL_RAS, "RAS rasPeerIdx %d (%s) is missing from coll->peers; dead %d", + rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), dead); rasPeerIdx++; } else { // cmp > 0 // Process not found in rasPeers -- shouldn't happen, unless during a race? + INFO(NCCL_RAS, "RAS failed to find coll->peer[%d] (%s) in rasPeers -- internal error?", + collPeerIdx, ncclSocketToString(coll->peers+collPeerIdx, rasLine)); collPeerIdx++; } // cmp > 0 } // for (rasPeerIdx, collPeerIdx) - // Sort the output by host and pid. - qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + // Sort the output by host and pid. rasAuxPeersValueCompare uses value as the primary key, which is 0 for + // all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, - ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } if (nPeersBuf != nPeersMissing) rasOutAppend(" [could not find information on %d process%s]\n", nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : "")); - free(peersBuf); + free(auxPeersBuf); } // if (rasCountIsOutlier(nPeersMissing)) rasOutAppend("\n"); } @@ -1121,31 +1232,35 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, (nRasDeadPeers > 1 ? "es are" : " is")); if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { - struct rasPeerInfo* peersReSorted = nullptr; - int nPeersReSorted = 0; - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + // rasDeadPeers contains only addresses, whereas we want a complete rasPeerInfo, and sorted differently. + struct rasAuxPeerInfo* auxPeersBuf = nullptr; + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nRasDeadPeers), ret, fail); for (int i = 0; i < nRasDeadPeers; i++) { int peerIdx = rasPeerFind(rasDeadPeers+i); if (peerIdx != -1) - memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + auxPeersBuf[nPeersBuf++].peer = rasPeers+peerIdx; } - // Sort the output by host and pid, not host and port. 
- qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); - for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + // Sort the output by host and pid, not host and port. rasAuxPeersValueCompare uses value as the primary key, + // which is 0 for all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } - if (nPeersReSorted != nRasDeadPeers) + if (nPeersBuf != nRasDeadPeers) rasOutAppend(" [could not find information on %d process%s]\n", - nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); - free(peersReSorted); + nRasDeadPeers-nPeersBuf, (nRasDeadPeers-nPeersBuf > 1 ? "es" : "")); + free(auxPeersBuf); } // if (rasCountIsOutlier(nRasDeadPeers) rasOutAppend("\n"); } + // Continue printing the largest communicators first, as in the summary table. for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc; vc = valCounts+vcIdx; @@ -1154,23 +1269,28 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { comm = auxComm->comm; if (auxComm->errors & RAS_ACE_INCOMPLETE) { - int nRanksMissing = comm->commNRanks - comm->nRanks; rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, - comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); - if (rasCountIsOutlier(nRanksMissing, client->verbose)) { - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + comm->commId.commHash, auxComm->nIncompleteRanks, (auxComm->nIncompleteRanks > 1 ? "s" : "")); + if (rasCountIsOutlier(auxComm->nIncompleteRanks, client->verbose)) { + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that provided a response but not for this communicator. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), ncclSocketsCompare) == + nullptr) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } + } // if rank did not respond + } // for (rankIdx) } // if (rasCountIsOutlier(nRanksMissing)) rasOutAppend("\n"); } // if (auxComm->errors & RAS_ACE_INCOMPLETE) @@ -1178,7 +1298,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (auxComm->errors & RAS_ACE_ERROR) { int ncclErrors[ncclNumResults]; int nErrors; - rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); memset(ncclErrors, '\0', sizeof(ncclErrors)); for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) @@ -1203,6 +1323,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } // if (auxComm->errors & RAS_ACE_ERROR) } // for (commIdx) } // for (vcIdx) + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("Warnings\n" "========\n\n"); @@ -1213,15 +1338,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); } + // Continue printing the largest communicators first, as in the summary table. for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc = valCounts+vcIdx; for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { - bool inconsistent; struct rasAuxComm* auxComm = auxComms+commIdx; comm = auxComm->comm; if (auxComm->errors & RAS_ACE_MISMATCH) { - rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); if (collOpCounts == nullptr) { // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts @@ -1234,28 +1359,31 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" Communicator ranks have different status\n"); // We need to sort the ranks by status. However, status is normally calculated from other fields. - // We will copy the ranks and reuse collOpCount to store it. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // We will store it in the auxCommRanks' value. 
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; if (rank->status.abortFlag) - rank->collOpCount = RAS_ACS_ABORT; + auxRank->value = RAS_ACS_ABORT; else if (rank->status.finalizeCalled || rank->status.destroyFlag) - rank->collOpCount = RAS_ACS_FINALIZE; + auxRank->value = RAS_ACS_FINALIZE; else if (rank->status.initState == ncclSuccess) - rank->collOpCount = RAS_ACS_RUNNING; + auxRank->value = RAS_ACS_RUNNING; else - rank->collOpCount = RAS_ACS_INIT; + auxRank->value = RAS_ACS_INIT; } - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Calculate the frequency of different status values. int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the - // status (which is a bitmask) into an array index. - collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + // status (which is a bitmask) into an array index. The argument is an unsigned int (there is no + // 64-bit version seemingly, but we don't actually need one here). + collOpCounts[nCollOpCounts].value = + (sizeof(unsigned int)*8-1) - __builtin_clz((unsigned int)auxCommRanks[rankIdx].value); collOpCounts[nCollOpCounts].count = 1; collOpCounts[nCollOpCounts].firstIdx = rankIdx; nCollOpCounts++; @@ -1263,11 +1391,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { collOpCounts[nCollOpCounts-1].count++; } } - if (comm->nRanks < comm->commNRanks) { - // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" - // string at index 0. - collOpCounts[nCollOpCounts].value = 0; - collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + if (comm->nMissingRanks - auxComm->nIncompleteRanks > 0) { + // Add a "fake" element corresponding to the NOCOMM entries, since they are not in the ranks array. + collOpCounts[nCollOpCounts].value = 0; // The index of "NOCOMM" in statusStr. + collOpCounts[nCollOpCounts].count = comm->nMissingRanks - auxComm->nIncompleteRanks; collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. nCollOpCounts++; } @@ -1280,114 +1407,159 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { if (vcc->firstIdx != -1) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
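The (sizeof(unsigned int)*8-1) - __builtin_clz(x) expression used above turns a status bitmask into the index of its highest set bit, which is what allows statusStr to be indexed directly (NOCOMM is bit 0, INIT bit 1, and so on). A quick standalone check of that mapping; the enum mirrors the rasACStatus bitmask values:

#include <stdio.h>

// Mirrors the rasACStatus bitmask values.
enum Status { NOCOMM = 1, INIT = 2, RUNNING = 4, FINALIZE = 8, ABORT = 16 };

static const char* const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };

// Highest set bit position: for a single-bit mask this is exactly the array index.
static int statusIndex(unsigned int status) {
  return (int)(sizeof(unsigned int)*8 - 1) - __builtin_clz(status);
}

int main() {
  unsigned int masks[] = { NOCOMM, INIT, RUNNING, FINALIZE, ABORT };
  for (int i = 0; i < 5; i++)
    printf("0x%02x -> index %d -> %s\n", masks[i], statusIndex(masks[i]), statusStr[statusIndex(masks[i])]);
  // For a combined bitmask the highest set bit wins, e.g.:
  printf("combined 0x%02x -> %s\n", (unsigned)(INIT|RUNNING), statusStr[statusIndex(INIT|RUNNING)]);
  return 0;
}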
for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; if (peerIdx != -1) { if (vcc->count > 1) rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); else rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value], - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value], + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); } else { // peerIdx == -1 if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); else rasOutAppend(" Rank %d has status %s -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value]); } // peerIdx == -1 } // for (rankIdx) } else { - // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - if (vcc->count > 1) { - rasOutAppend(" The unknown ranks: %s\n", lineBuf); - } else { - rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); - } - } + // NOCOMM ranks are in a different array. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + + comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that did not respond at all. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, statusStr[vcc->value], + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } else { + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + missingRank->commRank, statusStr[vcc->value]); + } + } // peerIdx == -1 + } // if rank responded + } // for (rankIdx) + } // vcc->firstIdx == -1 } // if (rasCountIsOutlier(vcc->count)) } // for (coc) } // if (__builtin_popcount(auxComm->status) > 1) - inconsistent = false; - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { - inconsistent = true; - break; - } - } - if (inconsistent) { - rasOutAppend(" Communicator ranks have different collective operation counts\n"); + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS; collIdx++) { + bool inconsistent = false; - // Sort the ranks by collOpCount and rank for easy counting. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); - // Calculate the frequency of different collOpCount values. - int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { - collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; - collOpCounts[nCollOpCounts].count = 1; - collOpCounts[nCollOpCounts].firstIdx = rankIdx; - nCollOpCounts++; - } else { - collOpCounts[nCollOpCounts-1].count++; + if (comm->ranks[rankIdx].collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) { + inconsistent = true; + break; } } - // Sort by that frequency (most frequent first). - qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); - for (int coc = 0; coc < nCollOpCounts; coc++) { - struct rasValCount* vcc = collOpCounts+coc; - if (vcc->count > 1) - rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); - if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... 
- for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - if (peerIdx != -1) { - if (vcc->count > 1) - rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, vcc->value, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - } else { // peerIdx == -1 - if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, vcc->value); - } // peerIdx == -1 - } // for (rankIdx) - } // if (rasCountIsOutlier(vcc->count)) - } // for (coc) - } // if (inconsistent) - rasOutAppend("\n"); + if (inconsistent) { + rasOutAppend(" Communicator ranks have different %s operation counts\n", ncclFuncStr[collIdx]); + + // Sort the ranks by collOpCounts[collIdx] and commRank for easy counting. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; + auxRank->value = rank->collOpCounts[collIdx]; + } + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); + // Calculate the frequency of different collOpCounts[collIdx] values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { + collOpCounts[nCollOpCounts].value = auxCommRanks[rankIdx].value; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) { + if (vcc->value > 0) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + else + rasOutAppend(" %d ranks have not launched any operations\n", vcc->count); + } + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
+ for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { + if (vcc->value > 0) { + rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d " + "on node %s\n", auxCommRanks[rankIdx].rank->commRank, vcc->value, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has not launched any operations -- GPU %s managed by process %d " + "on node %s\n", auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); + } else { + if (vcc->value > 0) + rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank, vcc->value); + else + rasOutAppend(" Rank %d has not launched any operations -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); + } + } // peerIdx == -1 + } // for (rankIdx) + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + rasOutAppend("\n"); + } // if (inconsistent) + } // for (collIdx) } // if (auxComm->errors & RAS_ACE_MISMATCH) } // for (commIdx) } // for (vcIdx) @@ -1398,20 +1570,26 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutExtract(msg); rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; + + TRACE(NCCL_RAS, "RAS: rasClientRunComms: finishing"); exit: free(peerNvmlDevs); free(collOpCounts); free(valCounts); free(peerIdxConv); - free(ranksReSorted); + free(auxCommRanks); free(auxComms); return ret; fail: goto exit; } +// Generates detailed info about encountered errors, be it initialization ones or asynchronous ones. static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) { + // Because the number of possible error kinds is finite and small, we don't bother in this case with allocating + // temporary data structures, counting the errors, sorting arrays, etc. Instead, in each iteration we pick the most + // numerous error kind, we iterate through the ranks in search for this error, and immediately add it to the output. for (;;) { int maxCount = 0; ncclResult_t maxCountIdx = ncclSuccess; @@ -1489,17 +1667,20 @@ static void rasOutAppend(const char* format, ...) { } nRasOutBuffer += needed; - assert(nRasOutBuffer <= rasOutBufferSize); + if (nRasOutBuffer >= rasOutBufferSize) + nRasOutBuffer = rasOutBufferSize - 1; // Should never happen, but just to be extra sure... exit: ; } // Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. 
// The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes -// the terminating '\0'). +// the terminating '\0'). Resets the output buffer when done. static void rasOutExtract(char* buffer) { - if (rasOutBuffer) + if (rasOutBuffer) { memcpy(buffer, rasOutBuffer, rasOutLength()); + rasOutReset(); + } } // Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. @@ -1524,60 +1705,25 @@ static void rasOutReset() { // Various sorting callbacks used when grouping/formatting data. // /////////////////////////////////////////////////////////////////// -// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the -// secondary key and the process id as the tertiary key. -static int rasPeersNGpuCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - int c1 = __builtin_popcountll(p1->cudaDevs); - int c2 = __builtin_popcountll(p2->cudaDevs); - - if (c1 == c2) { - // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; - } else { - return (c1 < c2 ? -1 : 1); - } -} - -// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. -// Uses the host IP as the secondary key and the process id as the tertiary key. -static int rasPeersNProcsCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; +// Sorting callback for rasAuxPeerInfo elements. Sorts by value, with the peers host IP as the secondary key and +// the process id as the tertiary key. +static int rasAuxPeersValueCompare(const void* e1, const void* e2) { + const struct rasAuxPeerInfo* p1 = (const struct rasAuxPeerInfo*)e1; + const struct rasAuxPeerInfo* p2 = (const struct rasAuxPeerInfo*)e2; - if (p1->cudaDevs == p2->cudaDevs) { + if (p1->value == p2->value) { // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + int cmp = ncclSocketsHostCompare(&p1->peer->addr, &p2->peer->addr); if (cmp == 0) { // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + cmp = (p1->peer->pid < p2->peer->pid ? -1 : (p1->peer->pid > p2->peer->pid ? 1 : 0)); } return cmp; } else { - return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); + return (p1->value < p2->value ? -1 : 1); } } -// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather -// than the port). -static int rasPeersHostPidCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the secondary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; -} - // Sorting callback for ncclSocketAddress. Unlike the ncclSocketsCompare, it ignores the port. 
static int ncclSocketsHostCompare(const void* p1, const void* p2) { const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; @@ -1599,7 +1745,8 @@ static int ncclSocketsHostCompare(const void* p1, const void* p2) { cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); } else { // The only remaining valid case are empty addresses. - assert(family == 0); + if (family != 0) + INFO(NCCL_RAS, "RAS invalid address family %d -- internal error?", family); cmp = 0; // Two empty addresses are equal... } @@ -1657,24 +1804,16 @@ static int rasAuxCommsCompareRev(const void* p1, const void* p2) { } } -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx. -static int rasCommRanksPeerCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0)); -} +// Sorting callback for rasAuxCommRank elements. Sorts by value, with rank's commRank as the secondary key. +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) { + const struct rasAuxCommRank* r1 = (const struct rasAuxCommRank*)p1; + const struct rasAuxCommRank* r2 = (const struct rasAuxCommRank*)p2; -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key. -static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - if (r1->collOpCount == r2->collOpCount) { - // Use the rank as the secondary key. - return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0)); + if (r1->value == r2->value) { + // Use the commRank as the secondary key. + return (r1->rank->commRank < r2->rank->commRank ? -1 : (r1->rank->commRank > r2->rank->commRank ? 1 : 0)); } else { - return (r1->collOpCount < r2->collOpCount ? -1 : 1); + return (r1->value < r2->value ? -1 : 1); } } @@ -1705,16 +1844,22 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, return buf; } -// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are +// Formats a GPU string based on the CUDA/NVML ids provided. If the CUDA id is different from the NVML id, both are // printed. -static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { - snprintf(buf, size, "%d", rank->cudaDev); - if (rank->cudaDev != rank->nvmlDev) { - snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev); +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) { + snprintf(buf, size, "%d", cudaDev); + if (cudaDev != nvmlDev) { + snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev); } return buf; } +// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are +// printed. +static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { + return rasGpuToString(rank->cudaDev, rank->nvmlDev, buf, size); +} + // Converts a NCCL error result to a string. 
static const char* ncclErrorToString(ncclResult_t err) { switch (err) { @@ -1753,3 +1898,21 @@ static bool rasCountIsOutlier(int count, bool verbose, int totalCount) { (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION); } } + +// Invoked during RAS termination to release all the allocated resources. +void rasClientSupportTerminate() { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + + free(rasOutBuffer); + rasOutBuffer = nullptr; + nRasOutBuffer = rasOutBufferSize = 0; + + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + rasClientTerminate(client); + client = clientNext; + } + + // rasClientsHead and rasClientsTail are taken care of by rasClientTerminate(). +} diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 201144f1a..72833604f 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! +#define NDEBUG // Comment out during development only! #include #include @@ -12,6 +12,7 @@ #include "checks.h" #include "comm.h" #include "nccl.h" +#include "transport.h" #include "utils.h" #include "ras_internal.h" @@ -32,14 +33,14 @@ static int nRasCollHistory, rasCollHistNextIdx; // Monotonically increased to ensure that each collective originating locally has a unique Id. static uint64_t rasCollLastId; -// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// Keeping track of ongoing collective operations (apart from broadcasts, which have no response so require // no such tracking). -struct rasCollective* rasCollectives; -static int nRasCollectives; +struct rasCollective* rasCollectivesHead; +struct rasCollective* rasCollectivesTail; static ncclResult_t getNewCollEntry(struct rasCollective** pColl); static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); + const struct rasCollRequest* req, size_t reqLen, struct rasConnection* fromConn); static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); static ncclResult_t rasCollReadyResp(struct rasCollective* coll); static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, @@ -47,12 +48,17 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, const union ncclSocketAddress* peers, int nPeers, const char* data, int nData, int nLegTimeouts); -static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); -static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm); static int ncclCommsCompare(const void* p1, const void* p2); +static int peersHashesCompare(const void* p1, const void* p2); +static int peersHashesSearch(const void* k, const void* e); +static int rasCommIdCompare(const void* p1, const void* p2); 
+static int rasCollCommsMissingRankSearch(const void* k, const void* e); /////////////////////////////////////////////////////////////////////////////////////// @@ -62,22 +68,26 @@ static int ncclCommsCompare(const void* p1, const void* p2); // Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { struct rasCollective* coll; - int i; - for (i = 0; i < nRasCollectives; i++) - if (rasCollectives[i].type == RAS_MSG_NONE) - break; - if (i == nRasCollectives) { - NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); - nRasCollectives += RAS_INCREMENT; - } + int nRasConns; + + NCCLCHECK(ncclCalloc(&coll, 1)); - coll = rasCollectives+i; - memset(coll, '\0', sizeof(*coll)); coll->startTime = clockNano(); - coll->fromConnIdx = -1; + coll->fromConn = nullptr; // We are unlikely to use the whole array, but at least we won't need to realloc. + nRasConns = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + nRasConns++; NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + if (rasCollectivesHead) { + rasCollectivesTail->next = coll; + coll->prev = rasCollectivesTail; + rasCollectivesTail = coll; + } else { + rasCollectivesHead = rasCollectivesTail = coll; + } + *pColl = coll; return ncclSuccess; } @@ -95,21 +105,23 @@ void rasCollReqInit(struct rasCollRequest* req) { // in preparation for collective response messages. // pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible // in scenarios such as a total of two peers. -// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// pColl provides on return a pointer to the allocated rasCollective structure to track this collective (unless // it's a broadcast, which require no such tracking). -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, - int fromConnIdx) { +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone, + struct rasCollective** pColl, struct rasConnection* fromConn) { struct rasCollective* coll = nullptr; + struct rasCollRequest* reqMod = (struct rasCollRequest*)req; + size_t reqLen = 0; if (req->type >= RAS_COLL_CONNS) { // Keep track of this collective operation so that we can handle the responses appropriately. NCCLCHECK(getNewCollEntry(&coll)); - if (pCollIdx) - *pCollIdx = coll-rasCollectives; + if (pColl) + *pColl = coll; memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); coll->rootId = req->rootId; coll->type = req->type; coll->timeout = req->timeout; - coll->fromConnIdx = fromConnIdx; + coll->fromConn = fromConn; if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); coll->nPeers = 1; @@ -117,9 +129,9 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific initialization of accumulated data (using local data for now). if (req->type == RAS_COLL_CONNS) - (void)rasCollConnsInit(&coll->data, &coll->nData); + (void)rasCollConnsInit(&reqMod, &reqLen, &coll->data, &coll->nData); else if (req->type == RAS_COLL_COMMS) - (void)rasCollCommsInit(&coll->data, &coll->nData); + (void)rasCollCommsInit(&reqMod, &reqLen, &coll->data, &coll->nData); } else { // req->type < RAS_COLL_CONNS // Add the info to the collective message history. 
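+    // (The history is a small fixed-size buffer capped at COLL_HISTORY_SIZE entries; it lets us recognize a
+    // request that reaches us again over another connection so that it isn't processed a second time.)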
     nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
@@ -131,42 +143,42 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen,
     // Collective-specific message handling.
     if (req->type == RAS_BC_DEADPEER) {
       bool done = false;
-      rasMsgHandleBCDeadPeer(req, &done);
+      rasMsgHandleBCDeadPeer(&reqMod, &reqLen, &done);
       if (done)
         goto exit;
     }
   } // req->type < RAS_COLL_CONNS
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
-  (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx);
-  (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx);
+  (void)rasLinkSendCollReq(&rasNextLink, coll, reqMod, reqLen, fromConn);
+  (void)rasLinkSendCollReq(&rasPrevLink, coll, reqMod, reqLen, fromConn);
 
   if (coll && pAllDone)
     *pAllDone = (coll->nFwdSent == coll->nFwdRecv);
 
 exit:
+  if (reqMod != req)
+    free(reqMod);
   return ncclSuccess;
 }
 
 // Sends the collective message through all connections associated with this link (with the exception of the one
 // the message came from, if any).
 static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
-                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) {
-      struct rasConnection* conn = rasConns+linkConn->connIdx;
-      if (!conn->linkFlag) {
-        // We send collective messages through fully established and operational connections only.
-        if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
-          if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr)
-            coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx;
-        } // if (conn->sockIdx != -1 && RAS_SOCK_READY)
-        conn->linkFlag = true;
-      } // if (!conn->linkFlag)
-    } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx)
-  } // for (i)
+                                       const struct rasCollRequest* req, size_t reqLen,
+                                       struct rasConnection* fromConn) {
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) {
+      // We send collective messages through fully established and operational connections only.
+      if (linkConn->conn->sock && linkConn->conn->sock->status == RAS_SOCK_READY &&
+          !linkConn->conn->experiencingDelays) {
+        if (rasConnSendCollReq(linkConn->conn, req, reqLen) == ncclSuccess && coll != nullptr)
+          coll->fwdConns[coll->nFwdSent++] = linkConn->conn;
+      } // linkConn->conn is fully established and operational.
+      linkConn->conn->linkFlag = true;
+    } // if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag)
+  } // for (linkConn)
 
   return ncclSuccess;
 }
 
@@ -190,8 +202,8 @@ static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct
 // in which case it can immediately send the response.
 ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
   bool allDone = false;
-  int collIdx = -1;
-  assert(sock->connIdx != -1);
+  struct rasCollective* coll = nullptr;
+  assert(sock->conn);
 
   // First check if we've already handled this request (through another connection).
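+  // (Duplicates are expected: every peer re-broadcasts a request over both of its RAS links, minus the link it
+  // came from, so the same request can reach us along more than one path.)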
for (int i = 0; i < nRasCollHistory; i++) { @@ -202,7 +214,7 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Send an empty response so that the sender can account for it. The non-empty response has already been // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); } goto exit; @@ -211,31 +223,29 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Check if we're currently handling this collective request. - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && msg->collReq.rootId == coll->rootId) { assert(msg->collReq.type == coll->type); // Send an empty response so that the sender can account for it. The non-empty response will be // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); goto exit; } // if match - } // for (i) + } // for (coll) } // if (msg->collReq.type >= RAS_COLL_CONNS) // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. - NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + NCCLCHECK(rasNetSendCollReq(&msg->collReq, &allDone, &coll, sock->conn)); if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { - assert(collIdx != -1); + assert(coll); // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer // has more than one connection so there should always be _some_ other peer to forward the request to. - NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + NCCLCHECK(rasCollReadyResp(coll)); } exit: return ncclSuccess; @@ -245,9 +255,9 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { // Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't // any peers (unlikely), the peers sent their responses (likely), or we timed out. static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { - if (coll->fromConnIdx != -1) { + if (coll->fromConn) { // For remotely-initiated collectives, send the response back. - NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + NCCLCHECK(rasConnSendCollResp(coll->fromConn, &coll->rootAddr, coll->rootId, coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); // Add the identifying info to the collective message history. 
@@ -302,18 +312,15 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, // the data from the response into the accumulated data. If all the responses have been accounted for, sends the // accumulated response back. ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { - int collIdx; - struct rasCollective* coll = nullptr; + struct rasCollective* coll; char line[SOCKET_NAME_MAXLEN+1]; - for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { - coll = rasCollectives+collIdx; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && msg->collResp.rootId == coll->rootId) break; } - if (collIdx == nRasCollectives) { + if (coll == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, ncclSocketToString(&sock->sock.addr, rasLine)); @@ -321,11 +328,11 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { } coll->nLegTimeouts += msg->collResp.nLegTimeouts; - assert(sock->connIdx != -1); - // Account for the received response in our collective operation tracking. + assert(sock->conn); + // Account for the received response in our collective operations tracking. for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] == sock->connIdx) { - coll->fwdConns[i] = -1; + if (coll->fwdConns[i] == sock->conn) { + coll->fwdConns[i] = nullptr; break; } } @@ -353,46 +360,53 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { // Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being // terminated. 
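+// For the connection that the collective was received from, the whole collective is dropped; for a connection that
+// we forwarded the request to, we merely stop waiting for its response and count it as a leg timeout.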
-void rasCollsPurgeConn(int connIdx) { - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE) { - char line[SOCKET_NAME_MAXLEN+1]; - if (coll->fromConnIdx == connIdx) { - INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", - ncclSocketToString(&coll->rootAddr, line), coll->rootId, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - rasCollFree(coll); - } else { - for (int j = 0; j < coll->nFwdSent; j++) { - if (coll->fwdConns[j] == connIdx) { - coll->fwdConns[j] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - if (coll->nFwdSent == coll->nFwdRecv) - (void)rasCollReadyResp(coll); - break; - } - } // for (j) - } // coll->fromConnIdx != connIdx - } // !RAS_MSG_NONE - } // for (i) +void rasCollsPurgeConn(struct rasConnection* conn) { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConn == conn) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&conn->addr, rasLine)); + rasCollFree(coll); + } else { + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == conn) { + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), coll->rootId, + coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (i) + } // coll->fromConn != conn + coll = collNext; + } // for (coll) } // Frees a rasCollective entry and any memory associated with it. void rasCollFree(struct rasCollective* coll) { + if (coll == nullptr) + return; + free(coll->fwdConns); - coll->fwdConns = nullptr; free(coll->peers); - coll->peers = nullptr; free(coll->data); - coll->data = nullptr; - coll->fromConnIdx = -1; - coll->type = RAS_MSG_NONE; + + if (coll == rasCollectivesHead) + rasCollectivesHead = rasCollectivesHead->next; + if (coll == rasCollectivesTail) + rasCollectivesTail = rasCollectivesTail->prev; + if (coll->prev) + coll->prev->next = coll->next; + if (coll->next) + coll->next->prev = coll->prev; + free(coll); } // Invoked from the main RAS thread loop to handle timeouts of the collectives. @@ -407,64 +421,64 @@ void rasCollFree(struct rasCollective* coll) { // and send back whatever we have. Unfortunately, the peer that the RAS client is connected to will in all likelihood // time out first, so at that point any delayed responses that eventually arrive are likely to be too late... void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { - struct rasCollective* coll = rasCollectives+collIdx; - if (coll->type == RAS_MSG_NONE || coll->timeout == 0) - continue; - - if (now - coll->startTime > coll->timeout) { - // We've exceeded the leg timeout. For all outstanding responses, check their connections. 
- if (!coll->timeoutWarned) { - INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", - ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, - (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->timeoutWarned = true; - } - for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] != -1) { - struct rasConnection* conn = rasConns+coll->fwdConns[i]; - char line[SOCKET_NAME_MAXLEN+1]; - if (!conn->experiencingDelays && conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - // Ensure that the connection is fully established and operational, and that the socket hasn't been - // re-created during the handling of the collective (which would suggest that the request may have been - // lost). - if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) - continue; - } - // In all other cases we declare a timeout so that we can (hopefully) recover. - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - coll->fwdConns[i] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - } // if (coll->fwdConns[i] != -1) - } // for (i) - if (coll->nFwdSent == coll->nFwdRecv) { - (void)rasCollReadyResp(coll); - } else { - // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they - // must be due to delays at other processes. Presumably those processes will give up waiting soon and the - // (incomplete) responses will arrive shortly, so we should wait a little longer. - if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { - // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though - // the originator of the collective, if it's not us, may have timed out already anyway). - INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + if (coll->timeout > 0) { + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; - coll->nFwdRecv = coll->nFwdSent; + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i]) { + struct rasConnection* conn = coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sock) { + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (conn->sock->status == RAS_SOCK_READY && conn->sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. 
+ INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i]) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { (void)rasCollReadyResp(coll); } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); - } - } // conn->nFwdRecv < conn->nFwdSent - } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); - } - } // for (collIdx) + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). + INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // conn->nFwdRecv < conn->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // if (coll->timeout > 0) + + coll = collNext; + } // for (coll) } @@ -476,15 +490,16 @@ void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well // as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, // but the system clocks may not be perfectly in sync). -static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; struct rasCollConns* pConnsData; + *pReqLen = rasCollDataLength(RAS_COLL_CONNS); + // Update the statistical data first and in the process also calculate how much connection-specific space we // will need. 
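+  // (A negative minimum travel time means that a message appeared to arrive before it was sent, i.e., the clocks
+  // of the two peers disagree; for every such connection we also record the source and destination addresses below.)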
- for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeCount > 0) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeCount > 0) { if (connsData.travelTimeMin > conn->travelTimeMin) connsData.travelTimeMin = conn->travelTimeMin; if (connsData.travelTimeMax < conn->travelTimeMax) @@ -502,9 +517,9 @@ static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { pConnsData = (struct rasCollConns*)*pData; memcpy(pConnsData, &connsData, sizeof(*pConnsData)); if (connsData.nNegativeMins > 0) { - for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeMin < 0) { + int negMinsIdx = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeMin < 0) { struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); @@ -560,10 +575,26 @@ static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* // Initializes the accumulated data with just the local data for now. // For this particular collective, we keep for every communicator information about every rank, to help identify // the missing ones and the discrepancies between the ones that did respond. -static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { +// For any new (previously unseen) communicator we also save the basic identification data about every rank that is +// "missing" (i.e., not part of this process). During merging, this should be replaced by the actual data from +// those ranks, if they are responsive. We want to provide this information to the user (so that we can say more +// than "rank xyz missing"). +// Every "new" communicator is also recorded in the (updated) request, so that when that request is forwarded to our +// peers, those peers don't needlessly send us the same data. +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { + ncclResult_t ret = ncclSuccess; struct rasCollComms* commsData; - int nComms = 0, nRanks = 0; + int nComms = 0, nRanks = 0, nMissingRanks = 0; + bool skipMissing = false; std::lock_guard lock(ncclCommsMutex); + struct rasCollComms::comm* comm; + struct rasCollRequest* req = nullptr; + struct rasPeerInfo** peersReSorted = nullptr; + int firstNewSkipMissingIdx = -1; + + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + (*pReq)->comms.nSkipMissingRanksComms * sizeof(*(*pReq)->comms.skipMissingRanksComms); + *pData = nullptr; // Start by counting the communicators so that we know how much space to allocate. // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case @@ -572,77 +603,152 @@ static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); ncclCommsSorted = true; } - for (int i = 0; i < nNcclComms; i++) { - if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { + if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. 
break; - if (i == 0) { - nComms = 1; - } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. + // Comparing just the commHash is OK though within communicators that are part of the same process. + if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { + skipMissing = rasCollCommsSkipMissing(*pReq, ncclComms[commIdx]); + if (!skipMissing) { + // Add this communicator to the request so that the processes we forward the request to know not to fill in + // the missing rank info. + struct rasCommId* skipComm; + if (req == nullptr) { + // We pessimistically allocate space for all the remaining communicators so that we don't need to reallocate. + int newSize = *pReqLen + (nNcclComms-commIdx) * sizeof(*req->comms.skipMissingRanksComms); + NCCLCHECKGOTO(ncclCalloc((char**)&req, newSize), ret, fail); + memcpy(req, *pReq, *pReqLen); + *pReq = req; + firstNewSkipMissingIdx = req->comms.nSkipMissingRanksComms; + } + skipComm = req->comms.skipMissingRanksComms + req->comms.nSkipMissingRanksComms++; + skipComm->commHash = ncclComms[commIdx]->commHash; + skipComm->hostHash = ncclComms[commIdx]->peerInfo->hostHash; + skipComm->pidHash = ncclComms[commIdx]->peerInfo->pidHash; + + nMissingRanks += ncclComms[commIdx]->nRanks; + } // if (!skipMissing) nComms++; - } + } // if encountered a new communicator nRanks++; - } + if (!skipMissing) + nMissingRanks--; + } // for (commIdx) - // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + // rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent // pointer manipulations somewhat unwieldy... - *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); - NCCLCHECK(ncclCalloc(pData, *pNData)); + // This is extra complicated because of the "hidden" array of struct rasCollCommsMissingRank following the + // ranks array for each communicator. + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks) + + nMissingRanks * sizeof(struct rasCollCommsMissingRank); + NCCLCHECKGOTO(ncclCalloc(pData, *pNData), ret, fail); commsData = (struct rasCollComms*)*pData; commsData->nComms = nComms; // comm points at the space in the accumulated data where the info about the current communicator is to be stored. - struct rasCollComms::comm* comm = commsData->comms; - for (int i = 0; i < nNcclComms; i++) { - struct rasCollComms::comm::rank* rank; - ncclResult_t asyncError; - if (ncclComms[i] == nullptr) - break; - if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - if (i > 0) - comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); - comm->commHash = ncclComms[i]->commHash; - comm->commNRanks = ncclComms[i]->nRanks; - comm->nRanks = 0; - } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { - INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... 
- } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - ncclComms[i]->rank, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - if (comm->nRanks == comm->commNRanks) { - INFO(NCCL_RAS, - "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", - comm->commNRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - rank = comm->ranks+comm->nRanks; - rank->commRank = ncclComms[i]->rank; - // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially - // always 0. It will increase after we send this response back to the peer we got the request from. - rank->peerIdx = 0; - rank->collOpCount = ncclComms[i]->collOpCount; - rank->status.initState = ncclComms[i]->initState; - if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; - rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); - rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); - rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); - rank->cudaDev = ncclComms[i]->cudaDev; - rank->nvmlDev = ncclComms[i]->nvmlDev; - comm->nRanks++; - } - assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + comm = commsData->comms; + // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. + for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { + struct ncclComm* ncclComm = ncclComms[commIdx]; + + comm->commId.commHash = ncclComm->commHash; + comm->commId.hostHash = ncclComm->peerInfo->hostHash; + comm->commId.pidHash = ncclComm->peerInfo->pidHash; + comm->commNRanks = ncclComm->nRanks; + comm->nRanks = comm->nMissingRanks = 0; + + // Fill in the comm->ranks array. + for (; commIdx < nNcclComms && ncclComms[commIdx] && ncclComms[commIdx]->commHash == comm->commId.commHash; + commIdx++) { + ncclComm = ncclComms[commIdx]; + struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; + ncclResult_t asyncError; + rank->commRank = ncclComm->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); + rank->status.initState = ncclComm->initState; + if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComm->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComm->cudaDev; + rank->nvmlDev = ncclComm->nvmlDev; + comm->nRanks++; + } // for (commIdx) + + if (firstNewSkipMissingIdx != -1 && + memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { + // Fill in the missingRanks array that follows the comm->ranks. 
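+      // (Per-communicator layout of the accumulated buffer: the comm header, then nRanks rank entries, then
+      // nMissingRanks rasCollCommsMissingRank entries -- hence the pointer arithmetic below and in the merge code.)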
+ struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + if (peersReSorted == nullptr) { + // Create a lookup table to rasPeers that is sorted by hostHash and pidHash, to reduce the complexity of the + // lookups in the missingRankIdx loop below. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) + peersReSorted[peerIdx] = rasPeers+peerIdx; + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesCompare); + } - return ncclSuccess; + comm->nMissingRanks = comm->commNRanks - comm->nRanks; + for (int missingRankIdx = 0, rankIdx = 0; missingRankIdx < comm->nMissingRanks; missingRankIdx++) { + struct rasCollCommsMissingRank* missingRank; + struct ncclPeerInfo* info; + struct rasPeerInfo** peer; + uint64_t key[2]; + // Look for the next "hole" in the ranks array. + while (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == rankIdx+missingRankIdx) + rankIdx++; + + missingRank = missingRanks + missingRankIdx; + missingRank->commRank = rankIdx + missingRankIdx; + info = ncclComm->peerInfo + missingRank->commRank; + key[0] = info->hostHash - ncclComm->commHash; + key[1] = info->pidHash - ncclComm->commHash; + peer = (struct rasPeerInfo**)bsearch(key, peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesSearch); + if (peer) + memcpy(&missingRank->addr, &(*peer)->addr, sizeof(missingRank->addr)); + missingRank->cudaDev = info->cudaDev; + missingRank->nvmlDev = info->nvmlDev; + } // for (missingRankIdx) + + if (++firstNewSkipMissingIdx == req->comms.nSkipMissingRanksComms) + firstNewSkipMissingIdx = -1; + } // if need to fill in the missingRanks + + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) + + comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + } // for (collCommIdx) + assert(((char*)comm) - (char*)commsData <= *pNData); + + if (req) { + // Finish updating the request. + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + req->comms.nSkipMissingRanksComms * sizeof(*req->comms.skipMissingRanksComms); + qsort(req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare); + } +ret: + free(peersReSorted); + return ret; +fail: + if (req) { + free(req); + *pReq = nullptr; + } + free(*pData); + *pData = nullptr; + goto ret; } // Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { - struct rasCollComms* collData; - struct rasCollComms* msgData; + struct rasCollComms* collData; // Data previously stored (locally) by our process. + struct rasCollComms* msgData; // Data just received from another process. int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); ALIGN_SIZE(dataOffset, alignof(int64_t)); @@ -650,7 +756,7 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* collData = (struct rasCollComms*)coll->data; if (msgData->nComms > 0) { - struct rasCollComms* newData = nullptr; + struct rasCollComms* newData = nullptr; // Destination buffer for the merged data. // Allocate the new buffer pessimistically (sized as the sum of the two old ones). 
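+    // (Pessimistic, because communicators present in both inputs are merged into a single entry, so the merged
+    // data can only ever be smaller than or equal to the sum of the two inputs.)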
NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); @@ -661,25 +767,28 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { int cmp; if (collIdx < collData->nComms && msgIdx < msgData->nComms) - cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + cmp = rasCommIdCompare(&collComm->commId, &msgComm->commId); else cmp = (collIdx < collData->nComms ? -1 : 1); if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + "possible hash collision (0x%lx, 0x%lx, 0x%lx)", collComm->commNRanks, msgComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); - // We try to preserve both separately, although the input data might already be messed up anyway... + // We try to preserve them both separately... } if (cmp == 0) { // Merge the comms. - newComm->commHash = collComm->commHash; + memcpy(&newComm->commId, &collComm->commId, sizeof(newComm->commId)); newComm->commNRanks = collComm->commNRanks; if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { INFO(NCCL_RAS, - "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", - collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->nRanks + msgComm->nRanks, newComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); + newComm->nRanks = newComm->commNRanks; // We'll skip the extras in the loop below. } else { newComm->nRanks = collComm->nRanks + msgComm->nRanks; @@ -691,16 +800,18 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* int cmpRank; if (newRankIdx == newComm->commNRanks) break; // Short of failing, the best we can do is skip... - if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) { cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); - else + } else { cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + } // There shouldn't be any overlaps in ranks between different sources. if (cmpRank == 0) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - collComm->ranks[collRankIdx].commRank, newComm->commHash); + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->ranks[collRankIdx].commRank, + newComm->commId.commHash, newComm->commId.hostHash, newComm->commId.pidHash); msgRankIdx++; // Short of failing, the best we can do is skip... } memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : @@ -708,23 +819,63 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* if (cmpRank > 0) { // peerIdx values from msgComm need to shift after merge. 
newComm->ranks[newRankIdx].peerIdx += coll->nPeers; - } + + if (collComm->nMissingRanks > 0) { + // Remove the corresponding entry from missingRanks. + struct rasCollCommsMissingRank* missingRank; + missingRank = (struct rasCollCommsMissingRank*)bsearch(&newComm->ranks[newRankIdx].commRank, + collComm->ranks+collComm->nRanks, + collComm->nMissingRanks, + sizeof(struct rasCollCommsMissingRank), + rasCollCommsMissingRankSearch); + if (missingRank) { + // Mark the entry as no longer needed. + memset(&missingRank->addr, '\0', sizeof(missingRank->addr)); + } else { + INFO(NCCL_RAS, "RAS failed to find missingRank data -- internal error?"); + } + } // if (collComm->nMissingRanks > 0) + } // if (cmpRank > 0) } // for (newRankIdx) - newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); - collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + if (collComm->nMissingRanks > 0) { + // Copy the missingRanks to newComm, skipping over any no longer needed entries. + union ncclSocketAddress emptyAddr; + struct rasCollCommsMissingRank* collMissingRanks; + struct rasCollCommsMissingRank* newMissingRanks; + int newRankIdx; + + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + collMissingRanks = (struct rasCollCommsMissingRank*)(collComm->ranks+collComm->nRanks); + newMissingRanks = (struct rasCollCommsMissingRank*)(newComm->ranks+newComm->nRanks); + newRankIdx = 0; + for (int collRankIdx = 0; collRankIdx < collComm->nMissingRanks; collRankIdx++) { + if (memcmp(&collMissingRanks[collRankIdx].addr, &emptyAddr, sizeof(emptyAddr))) { + memcpy(newMissingRanks + newRankIdx++, collMissingRanks + collRankIdx, sizeof(*newMissingRanks)); + } + } + newComm->nMissingRanks = newRankIdx; + assert(newComm->nRanks + newComm->nMissingRanks == newComm->commNRanks); + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks) + + newComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); collIdx++; - msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); msgIdx++; } else if (cmp < 0) { // Copy from collComm. - int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, collComm, commSize); newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); collIdx++; } else { // cmp > 0 // Copy from msgComm. - int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, msgComm, commSize); for (int i = 0; i < newComm->nRanks; i++) { // peerIdx values from msgComm need to shift after merge. 
@@ -745,18 +896,87 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* return ncclSuccess; } +// Checks if a given communicator is in the skipMissingRanksComms array of the request. +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm) { + struct rasCommId id; + id.commHash = comm->commHash; + id.hostHash = comm->peerInfo->hostHash; + id.pidHash = comm->peerInfo->pidHash; + return (bsearch(&id, req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare) != nullptr); +} + // Sorting callback for the ncclComms array. static int ncclCommsCompare(const void* p1, const void* p2) { - const ncclComm** pc1 = (const ncclComm**)p1; - const ncclComm** pc2 = (const ncclComm**)p2; + const ncclComm* comm1 = *(const ncclComm**)p1; + const ncclComm* comm2 = *(const ncclComm**)p2; // Put nullptr's at the end. - if (*pc1 == nullptr || *pc2 == nullptr) - return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + if (comm1 == nullptr || comm2 == nullptr) + return (comm1 != nullptr ? -1 : (comm2 != nullptr ? 1 : 0)); - if ((*pc1)->commHash == (*pc2)->commHash) { - return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + if (comm1->commHash == comm2->commHash) { + return (comm1->rank < comm2->rank ? -1 : (comm1->rank > comm2->rank ? 1 : 0)); } else { - return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + return (comm1->commHash < comm2->commHash ? -1 : 1); } } + +// Sorting callback for a lookup table to rasPeers. Sorts by the hostHash (primary) and pidHash (secondary). +static int peersHashesCompare(const void* p1, const void* p2) { + const struct rasPeerInfo* pi1 = *(const struct rasPeerInfo**)p1; + const struct rasPeerInfo* pi2 = *(const struct rasPeerInfo**)p2; + + if (pi1->hostHash == pi2->hostHash) { + return (pi1->pidHash < pi2->pidHash ? -1 : (pi1->pidHash > pi2->pidHash ? 1 : 0)); + } else { + return (pi1->hostHash < pi2->hostHash ? -1 : 1); + } +} + +// Search callback for a lookup table to rasPeers. Searches by the hostHash and pidHash. The key is an array +// containing the hostHash at index 0 and the pidHash at index 1. +static int peersHashesSearch(const void* k, const void* e) { + const uint64_t* key = (const uint64_t*)k; + const struct rasPeerInfo* elem = *(const struct rasPeerInfo**)e; + + if (key[0] == elem->hostHash) { + return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0)); + } else { + return (key[0] < elem->hostHash ? -1 : 1); + } +} + +// Sorting/searching callback for struct rasCommId. Sorts by commHash, then hostHash, then pidHash. +static int rasCommIdCompare(const void* p1, const void* p2) { + const struct rasCommId* i1 = (const struct rasCommId*)p1; + const struct rasCommId* i2 = (const struct rasCommId*)p2; + if (i1->commHash == i2->commHash) { + if (i1->hostHash == i2->hostHash) { + return (i1->pidHash < i2->pidHash ? -1 : (i1->pidHash > i2->pidHash ? 1 : 0)); + } else { + return (i1->hostHash < i2->hostHash ? -1 : 1); + } + } else { + return (i1->commHash < i2->commHash ? -1 : 1); + } +} + +// Search callback for rasCollComms::comm rasCollCommsMissingRank array. The key is the commRank. +static int rasCollCommsMissingRankSearch(const void* k, const void* e) { + int key = *(const int*)k; + const struct rasCollCommsMissingRank* elem = (const struct rasCollCommsMissingRank*)e; + + return (key < elem->commRank ? -1 : (key > elem->commRank ? 
1 : 0)); +} + +// Invoked during RAS termination to release all the allocated resources. +void rasCollectivesTerminate() { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + rasCollFree(coll); + coll = collNext; + } + + // rasCollectivesHead and rasCollectivesTail are taken care of by rasCollFree(). +} diff --git a/src/ras/peers.cc b/src/ras/peers.cc index f2692d3e1..8573209f1 100644 --- a/src/ras/peers.cc +++ b/src/ras/peers.cc @@ -40,10 +40,11 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); + struct rasRankInit* ranks = nullptr, int nranks = 0, + struct rasConnection* fromConn = nullptr); static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx); + struct rasConnection* fromConn); static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); @@ -146,6 +147,8 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks rankPeer->pid = rank->pid; rankPeer->cudaDevs = (1UL << rank->cudaDev); rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeer->hostHash = rank->hostHash; + rankPeer->pidHash = rank->pidHash; rankPeerIdx++; // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how @@ -357,12 +360,12 @@ int rasPeerFind(const union ncclSocketAddress* addr) { // ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members // of the new communicator being established), and who thus don't need to be notified. updatedDeadPeers can // be used, however, to request at least the propagation of rasDeadPeers to such peers. -// fromConnIdx -- if provided -- identified the connection used to receive this update; there's no need to +// fromConn -- if provided -- identifies the connection used to receive this update; there's no need to // propagate the update back through it. // Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new // connections as needed. static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks, int nranks, int fromConnIdx) { + struct rasRankInit* ranks, int nranks, struct rasConnection* fromConn) { ncclResult_t ret = ncclSuccess; // Do we actually have anything to do? @@ -371,8 +374,8 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // Start by propagating the update through the RAS network links. We consider any errors during this process // to be non-fatal (we can re-sync later around a keep-alive exchange). 
- (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); - (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); // Calculate new link peers and open new connections if needed. NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); @@ -388,15 +391,13 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // for the explanation of the function arguments. static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { // Note that we don't send the update via the connection that we received this notification from in the first // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; + if (linkConn->conn && linkConn->conn != fromConn) { // Failed propagations are not considered fatal (we will retry after a keep-alive). - (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + (void)rasConnPropagateUpdate(linkConn->conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); } } @@ -407,7 +408,7 @@ static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct ra // arguments. static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { // If we have the rank info, check if the peer on the other side of this connection has participated in the new // communicator. 
int connRank = -1; @@ -462,7 +463,8 @@ ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct ras msg->peersUpdate.deadPeersHash = rasDeadPeersHash; msg->peersUpdate.nDeadPeers = nDeadPeers; memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); - memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + if (nDeadPeers > 0) + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); if (nPeers > 0) conn->lastSentPeersHash = rasPeersHash; @@ -485,8 +487,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) ncclResult_t ret = ncclSuccess; struct rasMsg* newMsg = nullptr; int newMsgLen = 0; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; + assert(sock->conn); int nPeers, nDeadPeers; int deadPeersOffset = 0; bool updatePeers, updateDeadPeers; @@ -496,8 +497,8 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); - conn->lastRecvPeersHash = msg->peersUpdate.peersHash; - conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + sock->conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + sock->conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need // to send it. We'll find out by comparing the hash values after the merge. @@ -545,15 +546,15 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) rasDeadPeersDump(); // If post-merge the hashes are still different, send our (dead) peers back. - updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); - updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && - conn->lastRecvDeadPeersHash != rasDeadPeersHash); + updatePeers = (sock->conn->lastSentPeersHash != rasPeersHash && sock->conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (sock->conn->lastSentDeadPeersHash != rasDeadPeersHash && + sock->conn->lastRecvDeadPeersHash != rasDeadPeersHash); if (updatePeers || updateDeadPeers) { newMsg->peersUpdate.peersHash = rasPeersHash; newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; if (updatePeers) { assert(nPeers > 0); - conn->lastSentPeersHash = rasPeersHash; + sock->conn->lastSentPeersHash = rasPeersHash; } else { // If hashes match, make sure that we don't send the rasPeers back. 
newMsg->peersUpdate.nPeers = 0; @@ -564,14 +565,14 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) if (updateDeadPeers) { assert(nRasDeadPeers > 0); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); deadPeersOffset = newMsgLen; newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; } else { newMsg->peersUpdate.nDeadPeers = 0; @@ -580,13 +581,13 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); - rasConnEnqueueMsg(conn, newMsg, newMsgLen); + rasConnEnqueueMsg(sock->conn, newMsg, newMsgLen); newMsg = nullptr; } // if (updatePeers || updateDeadPeers) // Propagate the changes through our RAS network links. NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, - sock->connIdx), ret, fail); + sock->conn), ret, fail); } exit: @@ -603,7 +604,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) // Reinitializes the connection(s) of a particular link, following a peers update. // Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. -// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn // structures, so it's better to drop it all and recalculate from scratch. // We recalculate the primary peer; if an active connection to it already exists, then we're done. If there // is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and @@ -611,77 +612,51 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) // External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). static ncclResult_t rasLinkReinitConns(struct rasLink* link) { struct rasLinkConn* linkConn; - struct rasConnection* conn = nullptr; int newPeerIdx = myPeerIdx; - if (link->connsSize == 0) { - link->connsSize = RAS_INCREMENT; - NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); - } - link->nConns = 0; - - // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. - while (newPeerIdx != -1) { - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; + if (link->conns) { + // Free the old contents but keep the first entry for convenience (though wipe it). 
+ for (struct rasLinkConn* linkConn = link->conns->next; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; } + memset(link->conns, '\0', sizeof(*link->conns)); + link->lastUpdatePeersTime = 0; + } else { // link->conns == nullptr + NCCLCHECK(ncclCalloc(&link->conns, 1)); + } - newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); - if (newPeerIdx == -1) { - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); - if (link->nConns > 0) - break; - } - linkConn = link->conns+link->nConns; - linkConn->peerIdx = newPeerIdx; - linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); - linkConn->external = false; - - // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. - // Depending on the circumstances, we may first need to create that connection. - if (linkConn->connIdx == - 1) { - if (link->nConns == 0) { - if (linkConn->peerIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", - link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), - ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) - // to avoid races and the creation of duplicate connections. - if (myPeerIdx < linkConn->peerIdx) { - NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); - } - else { // If we didn't initiate the connection, start the timeout. - link->lastUpdatePeersTime = clockNano(); - } - } // if (linkConn->peerIdx != -1) - } else { // link->nConns > 0 - INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); - } // link->nConns > 0 - } else { // linkConn->connIdx != -1 - if (link->nConns == 0) { - INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", - link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + // Fill in the entry for the primary connection. + linkConn = link->conns; + linkConn->peerIdx = newPeerIdx = rasLinkCalculatePeer(link, myPeerIdx, /*isFallback*/false); + linkConn->conn = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : nullptr); + linkConn->external = false; + + if (linkConn->conn == nullptr) { + if (linkConn->peerIdx != -1) { + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? 
"opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn)); } - } - link->nConns++; - if (linkConn->connIdx == -1) - break; - conn = rasConns+linkConn->connIdx; - - // We check if the connection already went through the fallback calculation; if so, we'll need to create a new - // fallback in the next iteration, to ensure that RAS will keep retrying. - if (!conn->experiencingDelays) - break; + else { // If we didn't initiate the connection, start the timeout. + link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // linkConn->conn + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } // linkConn->conn + if (linkConn->conn && linkConn->conn->experiencingDelays) { INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + linkConn->conn->experiencingDelays, (clockNano()-linkConn->conn->startRetryTime)/1e9, + (linkConn->conn->sock ? linkConn->conn->sock->status : - 1)); + NCCLCHECK(rasLinkAddFallback(link, linkConn->conn)); } return ncclSuccess; @@ -701,39 +676,37 @@ int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallbac if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { // peerIdx is a fallback and it is not running on the same node as us. int tryPeerIdx = newPeerIdx; - int tryConnIdx = -1; + struct rasConnection* tryConn = nullptr; // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a // little suboptimal one. while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { - tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); - if (tryConnIdx != -1) { - struct rasConnection* tryConn = rasConns+tryConnIdx; + tryConn = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConn) { // Check if the connection is fully established and operational, i.e., if the underlying socket // is ready and there's been recent communication on it. - if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && - !tryConn->experiencingDelays) { + if (tryConn->sock && tryConn->sock->status == RAS_SOCK_READY && !tryConn->experiencingDelays) { // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in // this case. This is the only case when tryConnIdx != -1 after this loop. 
break; } - } // if (tryConnIdx != -1) + } // if (tryConn) } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) - tryConnIdx = -1; - tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + tryConn = nullptr; + tryPeerIdx = (tryPeerIdx + link->direction + nRasPeers) % nRasPeers; if (tryPeerIdx == myPeerIdx) break; } - if (tryConnIdx == -1) + if (tryConn == nullptr) newPeerIdx = tryPeerIdx; if (tryPeerIdx == myPeerIdx) break; } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) - + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; } @@ -932,7 +905,8 @@ bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSock static void rasPeersDump() { for (int p = 0; p < nRasPeers; p++) { const struct rasPeerInfo* peer = rasPeers+p; - INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), + (p == myPeerIdx ? " [this process]" : "")); } if (nRasPeers > 0) INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); @@ -958,3 +932,17 @@ static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nr rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); return result; } + +// Invoked during RAS termination to release all the allocated resources. +void rasPeersTerminate() { + free(rasPeers); + rasPeers = nullptr; + nRasPeers = 0; + rasPeersHash = 0; + myPeerIdx = -1; + + free(rasDeadPeers); + rasDeadPeers = nullptr; + nRasDeadPeers = rasDeadPeersSize = 0; + rasDeadPeersHash = 0; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 4905d7a69..8ef551c64 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,8 +4,10 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out during development only! -#include +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. +#include +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -65,8 +67,8 @@ int nNcclComms = 0; bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank. 
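For context on the ncclCommsSorted flag above: the RAS thread sorts the ncclComms array lazily, using the ncclCommsCompare callback shown earlier in this patch, so that later lookups by (commHash, rank) can use binary search. The following is a rough, standalone sketch of what that comparator produces, not a verbatim copy of the patch's code; the two-field ncclComm is a stand-in with only the members the comparator reads.

#include <cstdint>
#include <cstdlib>

// Stand-in with only the fields the comparator reads; the real ncclComm is much larger.
struct ncclComm { uint64_t commHash; int rank; };

// Mirrors ncclCommsCompare: nullptr entries sort last, then by commHash, then by rank.
static int commsCompare(const void* p1, const void* p2) {
  const ncclComm* c1 = *(const ncclComm* const*)p1;
  const ncclComm* c2 = *(const ncclComm* const*)p2;
  if (c1 == nullptr || c2 == nullptr) return (c1 != nullptr ? -1 : (c2 != nullptr ? 1 : 0));
  if (c1->commHash != c2->commHash) return (c1->commHash < c2->commHash ? -1 : 1);
  return (c1->rank < c2->rank ? -1 : (c1->rank > c2->rank ? 1 : 0));
}

int main() {
  ncclComm a{42, 1}, b{42, 0}, c{7, 3};
  ncclComm* comms[] = { &a, nullptr, &b, &c };
  // In the RAS thread this sort happens on demand, guarded by ncclCommsSorted.
  qsort(comms, 4, sizeof(comms[0]), commsCompare);   // resulting order: c, b, a, nullptr
  return 0;
}

Pushing nullptr entries to the end lets freed communicator slots be ignored without compacting the array on every change.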
static ncclResult_t rasLocalNotify(const struct rasNotification* msg); -static ncclResult_t rasLocalHandle(); -static void rasLocalHandleTerminate(); +static ncclResult_t rasLocalHandle(bool* terminate); +static void rasThreadCleanup(); static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); @@ -74,6 +76,8 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); +static void rasTerminate() __attribute__((destructor)); + NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); ////////////////////////////////////////////////// @@ -105,7 +109,6 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); ncclSetThreadName(rasThread, "NCCL RAS"); - (void)pthread_detach(rasThread); rasInitialized = true; } @@ -157,18 +160,27 @@ ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { } } } - if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { - struct rasNotification msg; - msg.type = RAS_TERMINATE; - NCCLCHECK(rasLocalNotify(&msg)); - } + ncclAtomicRefCountDecrement(&rasInitRefCount); return ncclSuccess; } +// Global destructor. Notifies the RAS thread to release all the resources +// and terminate. Waits for the thread to terminate. +static void rasTerminate() { + struct rasNotification msg; + if (!rasInitialized) + return; + memset(&msg, '\0', sizeof(msg)); + msg.type = RAS_TERMINATE; + if (rasLocalNotify(&msg) == ncclSuccess) + (void)pthread_join(rasThread, nullptr); +} + // Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within // the communicator. ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { struct rasNotification msg; + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_ADD_RANKS; msg.addRanks.ranks = ranks; msg.addRanks.nranks = nranks; @@ -199,7 +211,7 @@ static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { ///////////////////////////////////////////////////////////////////////////////// // Handles asynchronous local notifications arriving from regular NCCL threads. -static ncclResult_t rasLocalHandle() { +static ncclResult_t rasLocalHandle(bool* terminate) { struct rasNotification msg; size_t done = 0; @@ -212,9 +224,11 @@ static ncclResult_t rasLocalHandle() { } if (msg.type == RAS_ADD_RANKS) { - NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + (void)rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks); + // Not great if the above fails, but it shouldn't be critical; better to keep going. } else if (msg.type == RAS_TERMINATE) { - rasLocalHandleTerminate(); + INFO(NCCL_RAS, "RAS handling local termination request"); + *terminate = true; } else { WARN("RAS received unknown notification type %d", msg.type); return ncclInternalError; @@ -223,10 +237,35 @@ static ncclResult_t rasLocalHandle() { return ncclSuccess; } -// Handles local RAS_TERMINATE notification. -static void rasLocalHandleTerminate() { - INFO(NCCL_RAS, "RAS handling local termination request"); - // For now we don't do anything. +// Cleans up local RAS state, normally in response to a RAS_TERMINATE notification. 
+static void rasThreadCleanup() { + rasClientSupportTerminate(); + rasNetTerminate(); + rasCollectivesTerminate(); + rasPeersTerminate(); + + { + std::lock_guard lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + // rasClientListeningSocket is taken care of by rasClientSupportTerminate(). + rasNotificationPipe[0] = rasNotificationPipe[1] = -1; + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitRefCount = 0; + rasInitialized = false; + } + + { + std::lock_guard lock(ncclCommsMutex); + free(ncclComms); + ncclComms = nullptr; + nNcclComms = 0; + ncclCommsSorted = false; + } + + free(rasPfds); + rasPfds = nullptr; + nRasPfds = 0; } @@ -270,10 +309,10 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms else ncclIntruQueueEnqueue(&conn->sendQ, meta); - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { - rasPfds[sock->pfd].events |= POLLOUT; + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY || + (conn->sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[conn->sock->pfd].events |= POLLOUT; ready = true; } } @@ -283,31 +322,31 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", msg->type, ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + (conn->sock ? conn->sock->status : -1)); } } // Attempts to send the queued RAS messages to another RAS thread. ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { - struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; struct rasMsgMeta* meta; *closed = 0; while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { - if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + if (conn->sock->status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { // We don't send anything beyond the handshake at this point. meta = nullptr; break; } if (meta->offset < sizeof(meta->length)) { // Send the length of the message. - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, &meta->length, sizeof(meta->length), + &meta->offset, closed)); if (*closed) return ncclSuccess; if (meta->offset < sizeof(meta->length)) break; } // Send the body of the message. 
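As the message definitions later in this patch note, every RAS message on the wire is a 32-bit length followed by the message body, and meta->offset tracks progress across both parts. A simplified, blocking illustration of that framing is sketched below; it uses plain POSIX write() for clarity, whereas rasConnSendMsg goes through ncclSocketProgress so a partial send can be resumed later from the saved offset.

#include <unistd.h>
#include <cstdint>

// Illustrative blocking sender; 'length' is the message size, excluding the length field itself.
static bool sendFramedBlocking(int fd, const void* msg, int32_t length) {
  if (write(fd, &length, sizeof(length)) != (ssize_t)sizeof(length)) return false;  // length prefix
  if (write(fd, msg, (size_t)length) != (ssize_t)length) return false;              // message body
  return true;
}

The receiving side reads the length first and then knows exactly how many more bytes make up the rasMsg that follows.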
- NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, ((char*)&meta->msg)-sizeof(meta->length), meta->length+sizeof(meta->length), &meta->offset, closed)); if (*closed) return ncclSuccess; @@ -377,7 +416,7 @@ ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { ncclResult_t ret = ncclSuccess; struct rasConnection* conn = nullptr; - int connIdx, peerIdx; + int peerIdx; struct rasMsg* newMsg = nullptr; int newMsgLen; char line[SOCKET_NAME_MAXLEN+1]; @@ -406,19 +445,16 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc } // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). - connIdx = rasConnFind(&msg->connInit.listeningAddr); - if (connIdx != -1) { - conn = rasConns+connIdx; - + conn = rasConnFind(&msg->connInit.listeningAddr); + if (conn) { INFO(NCCL_RAS, "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); - if (conn->sockIdx != -1) { - struct rasSocket* connSock = rasSockets+conn->sockIdx; + if (conn->sock) { INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", - connSock->status, (clockNano()-connSock->createTime)/1e9); + conn->sock->status, (clockNano()-conn->sock->createTime)/1e9); // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have // a race where both sides attempt to establish a connection at roughly the same time, so the other side's // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. @@ -433,21 +469,19 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc goto exit; } else { INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); - rasSocketTerminate(connSock); + rasSocketTerminate(conn->sock); } } - } - if (!conn) { + } else { // conn == nullptr NCCLCHECK(getNewConnEntry(&conn)); memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); - connIdx = conn - rasConns; } sock->status = RAS_SOCK_READY; // rasConnResume will reset any experiencingDelays, startRetryTime, etc. - conn->sockIdx = sock-rasSockets; - sock->connIdx = connIdx; + conn->sock = sock; + sock->conn = conn; memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); // Make sure that the connection is part of the right links forming the RAS network. At this point we only @@ -456,8 +490,8 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before // the peers update. if (peerIdx != -1) { - (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); - (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + (void)rasLinkConnUpdate(&rasNextLink, conn, peerIdx); + (void)rasLinkConnUpdate(&rasPrevLink, conn, peerIdx); } // Send a confirmation to the server that requested the connection (so that the resilience code can mark @@ -504,12 +538,13 @@ static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct ras } // Handles the deadPeer broadcast. 
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { - INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&(*pReq)->deadPeer.addr, rasLine)); - if (!rasPeerIsDead(&req->deadPeer.addr)) { - rasConnDisconnect(&req->deadPeer.addr); - (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pReqLen = rasCollDataLength(RAS_BC_DEADPEER); + if (!rasPeerIsDead(&(*pReq)->deadPeer.addr)) { + rasConnDisconnect(&(*pReq)->deadPeer.addr); + (void)rasPeerDeclareDead(&(*pReq)->deadPeer.addr); *pDone = false; } else { INFO(NCCL_RAS, "RAS already knew it was dead"); @@ -530,6 +565,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock) { INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_MSG_CONNINITACK; msg.connInitAck.nack = 1; offset = 0; @@ -557,16 +593,16 @@ static void* rasThreadMain(void*) { INFO(NCCL_RAS, "RAS thread started"); // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasNotificationPipe[0]; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); - NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, exit); rasPfds[pfd].fd = rasNetListeningSocketFd; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasClientListeningSocket; rasPfds[pfd].events = POLLIN; @@ -595,32 +631,37 @@ static void* rasThreadMain(void*) { if (rasPfds[pollIdx].revents) { nEvents--; if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { - (void)rasLocalHandle(); + bool terminate = false; + NCCLCHECKGOTO(rasLocalHandle(&terminate), ret, exit); + if (terminate) + goto exit; } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { (void)rasNetAcceptNewSocket(); } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { (void)rasClientAcceptNewSocket(); } else { // Check if it's one of the RAS sockets. - int sockIdx; - for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; - if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { - rasSockEventLoop(sockIdx, pollIdx); + struct rasSocket* sock; + for (sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sock, pollIdx); break; } - } // for (sockIdx) + sock = sockNext; + } // for (sock) - if (sockIdx == nRasSockets) { + if (sock == nullptr) { // Try a client socket instead. 
- for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { - struct rasClient* client = rasClients+clientIdx; - if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { - rasClientEventLoop(clientIdx, pollIdx); + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + if (rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(client, pollIdx); break; } - } // for (clientIdx) - } // if (sockIdx == nRasSockets) + client = clientNext; + } // for (client) + } // if (sock == nullptr) } // dynamic fds } // if (revents) } // for (pollIdx) @@ -636,14 +677,9 @@ static void* rasThreadMain(void*) { rasCollsHandleTimeouts(now, &nextWakeup); } // for (;;) -fail: - WARN("fatal error - RAS thread terminating"); - std::lock_guard lock(rasInitMutex); - (void)close(rasNotificationPipe[1]); - (void)close(rasNotificationPipe[0]); - (void)close(rasClientListeningSocket); - (void)ncclSocketClose(&rasNetListeningSocket); - rasInitialized = false; +exit: + rasThreadCleanup(); + INFO(NCCL_RAS, "RAS thread terminating"); return nullptr; } diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 715fff4a4..17326c342 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -42,6 +42,14 @@ typedef enum { RAS_COLL_COMMS = 1002, // Collect data about all communicators. } rasCollectiveType; +// Unique communicator identifier. commHash by itself is definitely not guaranteed to be unique. +// Combined with the two other hashes, the chance is much better... +// All three fields are used for sorting. +struct rasCommId { + uint64_t commHash; + uint64_t hostHash, pidHash; // These are the hashes of the *first* rank (comm->peerInfo[0]). +}; + // Payload of a collective request message (RAS_MSG_COLLREQ). struct rasCollRequest { union ncclSocketAddress rootAddr; @@ -56,6 +64,10 @@ struct rasCollRequest { struct { } conns; struct { + int nSkipMissingRanksComms; // Number of elements in the array below. + // Communicators for which we do *not* need the missingRanks data in the responses + // (see struct rasCollCommsMissingRank later). + struct rasCommId skipMissingRanksComms[0]; // Variable length, sorted. } comms; }; }; @@ -69,8 +81,8 @@ struct rasCollResponse { int nPeers; int nData; // Size of data in bytes. union ncclSocketAddress peers[0]; // Variable length. - // The peersAddrs array is followed by: - //alignas(int64_t) char data[0]; // Variable length, collective-dependent. + // The peers array is followed by: + // alignas(int64_t) char data[0]; // Variable length, collective-dependent. }; // Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each @@ -80,6 +92,8 @@ struct rasPeerInfo { pid_t pid; uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. + uint64_t hostHash, pidHash; // Taken from ncclComm, but with the commHash subtracted to make it + // communicator-independent. }; // Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host @@ -112,7 +126,7 @@ struct rasMsg { int nPeers; int nDeadPeers; struct rasPeerInfo peers[0]; // Variable length. - // The peers array is followed by the following: + // The peers array is followed by: //union ncclSocketAddress deadPeers[0]; // Variable length. 
} peersUpdate; struct { @@ -218,6 +232,9 @@ struct rasMsgMeta { // Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response). // For every collective operation, each participating RAS thread will create its own. struct rasCollective { + struct rasCollective* next; + struct rasCollective* prev; + union ncclSocketAddress rootAddr; uint64_t rootId; @@ -227,15 +244,16 @@ struct rasCollective { bool timeoutWarned; int64_t startTime; // For timeout calculations. - int fromConnIdx; // The connection we received the request from. + struct rasConnection* fromConn; // The connection we received the request from. - int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive. + struct rasConnection** fwdConns; // Connections we forwarded the request to; replaced by nullptr's as the + // responses arrive. int nFwdSent; // Count of the above (local process only). int nFwdRecv; // Count of the responses received or timeouts (local process only). int nLegTimeouts; // Collective (from this process and the responses we received). - union ncclSocketAddress* peers; // Collective (from this process and the responses we received). + union ncclSocketAddress* peers; // Collective (from this process and the responses we received). Unsorted. int nPeers; char* data; // Collective (from this process and the responses we received). @@ -261,13 +279,14 @@ struct rasCollConns { struct rasCollComms { int nComms; struct comm { - uint64_t commHash; - int commNRanks; - int nRanks; // number of elements in the array below, *not* in the communicator. + struct rasCommId commId; + int commNRanks; // >= nRanks + nMissingRanks + int nRanks; // Number of elements in the ranks array below, *not* in the communicator. + int nMissingRanks; // Number of elements in the missingRanks array below. struct rank { int commRank; int peerIdx; // Index within rasCollective->peers, *not* rasPeers. - uint64_t collOpCount; + uint64_t collOpCounts[NCCL_NUM_FUNCTIONS]; struct { ncclResult_t initState:4; ncclResult_t asyncError:4; @@ -278,34 +297,47 @@ struct rasCollComms { char cudaDev; char nvmlDev; } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process. - } comms[0]; // Variable length. Sorted by commHash. + // The ranks array is followed by: + // struct rasCollCommsMissingRank missingRanks[0]; // Variable length. Sorted by commRank. + } comms[0]; // Variable length. Sorted by commId. +}; + +// Provides info about missing ranks. An array of these structures can be part of struct rasCollComms above. +// Because the arrays are of variable length, we can't describe them in C. To ensure that adding +// rasCollCommsMissingRank structures doesn't mess up the alignment, we explicitly request one. +struct alignas(struct rasCollComms) rasCollCommsMissingRank { + int commRank; + union ncclSocketAddress addr; + // We don't need pid here as we can look it up in rasPeers via addr. + char cudaDev; + char nvmlDev; }; // Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one // or one of the fallbacks). struct rasLinkConn { + struct rasLinkConn* next; int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates). - int connIdx; // Index in the rasConns array of the connection to the above peer. 
Could be -1 (a placeholder - // for a connection to be started by the remote peer). + struct rasConnection* conn; // The connection to the above peer. Could be nullptr (a placeholder for a connection + // to be started by the remote peer). bool external; // true if the entry exists only due to an external request (requested by a remote peer, most // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a // valid primary connection, in order to ensure that keep-alive messages are sent. }; // Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in -// case of 1-D topology) rather than a particular destination. The are implemented using rasConnections, but +// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but // they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS // network is reconfigured or a peer dies. struct rasLink { int direction; // 1 for nextLink, -1 for prevLink. - // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having - // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have - // the lowest indices). + // First element is the primary connection; any additional ones are fallbacks (that get created if we are having + // problems with the primary connection). The highest-preference elements come first; the list is de-facto sorted + // by peerIdx, though peerIdx values can wrap around (given the ring/torus topology) and they can also be -1 + // (the latter are stored at the end). struct rasLinkConn* conns; - int nConns; - int connsSize; // Array size; could be larger than nConns. // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect // the peer on the other side to do so) but that peer failed to initiate. @@ -315,15 +347,15 @@ struct rasLink { // Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile // socket (described by the rasSocket structure), which can be affected by transient network issues. struct rasConnection { - bool inUse; + struct rasConnection* next; + struct rasConnection* prev; union ncclSocketAddress addr; - // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // Pointer to the current rasSocket. Note that multiple rasSocket entries may point back // to a single entry here, for sockets that are in the process of being terminated and re-established. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. - // -1 if there is no such socket. - int sockIdx; + // nullptr if there is no such socket. + struct rasSocket* sock; // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. @@ -371,16 +403,18 @@ typedef enum { // Describes a socket implementing communication between two peers. struct rasSocket { + struct rasSocket* next; + struct rasSocket* prev; + struct ncclSocket sock; rasSocketStatus status; int pfd; // Index in the rasPfds array. - // Index of the corresponding entry in the rasConns array. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. 
- // -1 if there is no connection (normal condition on the accept side before the connInit message). - int connIdx; + // Pointer to the corresponding entry in the rasConns array. + // nullptr if there is no connection (a normal condition on the accept side before the connInit message). + struct rasConnection* conn; int64_t createTime; int64_t lastSendTime; @@ -404,7 +438,10 @@ typedef enum { // Describes a RAS client. struct rasClient { - int sock; + struct rasClient* next; + struct rasClient* prev; + + int sock; // File descriptor rasClientStatus status; @@ -420,7 +457,7 @@ struct rasClient { int64_t timeout; // State stored during asynchronous operations such as collectives. - int collIdx; // Index to the onging rasCollective. + struct rasCollective* coll; }; @@ -440,31 +477,33 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent); ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed); ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock); -void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone); ncclResult_t rasGetNewPollEntry(int* index); // rasnet.cc extern struct rasLink rasNextLink, rasPrevLink; -extern struct rasConnection* rasConns; -extern int nRasConns; -extern struct rasSocket *rasSockets; -extern int nRasSockets; +extern struct rasConnection* rasConnsHead; +extern struct rasConnection* rasConnsTail; +extern struct rasSocket *rasSocketsHead; +extern struct rasSocket *rasSocketsTail; ncclResult_t getNewConnEntry(struct rasConnection** pConn); -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx); -int rasConnFind(const union ncclSocketAddress* addr); +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn); +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr); void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasConnDisconnect(const union ncclSocketAddress* addr); ncclResult_t rasNetAcceptNewSocket(); void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0, bool retry = true); -void rasSockEventLoop(int sockIdx, int pollIdx); +void rasSockEventLoop(struct rasSocket* sock, int pollIdx); void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup); ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock); -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false, - bool insert = false, bool pretend = false, int* pLinkIdx = nullptr); +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn); +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx); +void rasNetTerminate(); + // peers.cc extern struct rasPeerInfo* rasPeers; @@ -483,29 +522,35 @@ ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr); bool rasPeerIsDead(const union ncclSocketAddress* addr); int ncclSocketsCompare(const void* p1, const void* p2); bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2); +void rasPeersTerminate(); // collectives.cc -extern struct rasCollective* rasCollectives; +extern struct rasCollective* rasCollectivesHead; +extern struct 
rasCollective* rasCollectivesTail; void rasCollReqInit(struct rasCollRequest* req); -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr, - int* pCollIdx = nullptr, int fromConnIdx = -1); +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone = nullptr, + struct rasCollective** pColl = nullptr, struct rasConnection* fromConn = nullptr); ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock); ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock); -void rasCollsPurgeConn(int connIdx); +void rasCollsPurgeConn(struct rasConnection* conn); void rasCollFree(struct rasCollective* coll); void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup); +void rasCollectivesTerminate(); + // client_support.cc extern int rasClientListeningSocket; -extern struct rasClient* rasClients; -extern int nRasClients; +extern struct rasClient* rasClientsHead; +extern struct rasClient* rasClientsTail; + ncclResult_t rasClientInitSocket(); ncclResult_t rasClientAcceptNewSocket(); ncclResult_t rasClientResume(struct rasCollective* coll); -void rasClientEventLoop(int clientIdx, int pollIdx); +void rasClientEventLoop(struct rasClient* client, int pollIdx); const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size); +void rasClientSupportTerminate(); #endif // !NCCL_RAS_CLIENT diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 441ad192c..43aa042a7 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -13,90 +13,106 @@ struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // Connections on the RAS network. -struct rasConnection* rasConns; -int nRasConns; +struct rasConnection* rasConnsHead; +struct rasConnection* rasConnsTail; // Sockets implementing the RAS network. -struct rasSocket *rasSockets; -int nRasSockets; +struct rasSocket *rasSocketsHead; +struct rasSocket *rasSocketsTail; // Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but // I didn't want to use -1 because it has a special meaning for us. 
#define POLL_FD_IGNORE -2 +static void freeConnEntry(struct rasConnection* conn); static void rasConnOpen(struct rasConnection* conn); static ncclResult_t rasConnPrepare(struct rasConnection* conn); static void rasConnTerminate(struct rasConnection* conn); static ncclResult_t getNewSockEntry(struct rasSocket** pSock); +static void freeSockEntry(struct rasSocket* sock); static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup); -static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup); +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup); static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false); -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx); static void rasConnResume(struct rasConnection* conn); static void rasLinkSanitizeFallbacks(struct rasLink* link); -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); -static int rasLinkFindConn(const struct rasLink* link, int connIdx); +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend = false, + int* pLinkIdx = nullptr, struct rasLinkConn** pLinkConn = nullptr, + bool insert = true); +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx); +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external = false); +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx = nullptr); /////////////////////////////////////////////// // Functions related to the RAS connections. // /////////////////////////////////////////////// -// Allocates an entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasConnections list. ncclResult_t getNewConnEntry(struct rasConnection** pConn) { struct rasConnection* conn; - int i; - for (i = 0; i < nRasConns; i++) - if (!rasConns[i].inUse) - break; - if (i == nRasConns) { - NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT)); - nRasConns += RAS_INCREMENT; - } - conn = rasConns+i; - memset(conn, '\0', sizeof(*conn)); - conn->inUse = true; - conn->sockIdx = -1; + NCCLCHECK(ncclCalloc(&conn, 1)); + ncclIntruQueueConstruct(&conn->sendQ); conn->travelTimeMin = INT64_MAX; conn->travelTimeMax = INT64_MIN; + if (rasConnsHead) { + rasConnsTail->next = conn; + conn->prev = rasConnsTail; + rasConnsTail = conn; + } else { + rasConnsHead = rasConnsTail = conn; + } + *pConn = conn; return ncclSuccess; } +// Frees an entry from the rasConns list. +static void freeConnEntry(struct rasConnection* conn) { + if (conn == nullptr) + return; + + if (conn == rasConnsHead) + rasConnsHead = rasConnsHead->next; + if (conn == rasConnsTail) + rasConnsTail = rasConnsTail->prev; + if (conn->prev) + conn->prev->next = conn->next; + if (conn->next) + conn->next->prev = conn->prev; + free(conn); +} + // Creates a new RAS network connection to a remote peer address. -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn) { ncclResult_t ret = ncclSuccess; - struct rasConnection* conn = nullptr; + struct rasConnection* conn; // First check if a connection entry for this peer already exists. 
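The getNewConnEntry()/freeConnEntry() pair above shows the pattern this patch applies throughout the RAS code: the realloc-ed arrays indexed by connIdx/sockIdx are replaced with doubly-linked lists anchored by head/tail pointers, so every entry keeps a stable address and can be freed individually. A generic, self-contained sketch of the same append/unlink logic follows; the names are illustrative and not taken from the patch.

#include <cstdlib>

struct node { node* next; node* prev; int payload; };
static node* listHead;
static node* listTail;

// Append a zero-initialized entry at the tail, as getNewConnEntry does for rasConns.
static node* listAppend() {
  node* n = (node*)calloc(1, sizeof(node));
  if (n == nullptr) return nullptr;
  if (listHead) { listTail->next = n; n->prev = listTail; listTail = n; }
  else { listHead = listTail = n; }
  return n;
}

// Unlink and free a single entry, as freeConnEntry/freeSockEntry do.
static void listRemove(node* n) {
  if (n == nullptr) return;
  if (n == listHead) listHead = n->next;
  if (n == listTail) listTail = n->prev;
  if (n->prev) n->prev->next = n->next;
  if (n->next) n->next->prev = n->prev;
  free(n);
}

Because entries are individually heap-allocated, pointers such as conn->sock and sock->conn stay valid as other entries come and go, which is exactly the problem the removed "indices, not pointers" comments were working around.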
- int connIdx = rasConnFind(addr); - if (connIdx != -1) { - conn = rasConns+connIdx; - } + conn = rasConnFind(addr); - if (conn && conn->sockIdx != -1) { + if (conn && conn->sock) { // An entry exists and has a socket associated with it -- nothing left for us to do. - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; goto exit; } - if (!conn) { + if (conn == nullptr) { NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); memcpy(&conn->addr, addr, sizeof(conn->addr)); // We are establishing a new connection -- start the timeout. conn->startRetryTime = clockNano(); - connIdx = conn - rasConns; } - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; rasConnOpen(conn); @@ -107,7 +123,7 @@ ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { // Opens a connection to a remote peer. static void rasConnOpen(struct rasConnection* conn) { ncclResult_t ret; // Not used. - struct rasSocket* sock; + struct rasSocket* sock = nullptr; bool closeSocketOnFail = false; int ready; @@ -120,10 +136,8 @@ static void rasConnOpen(struct rasConnection* conn) { NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures - // we don't need to clean them up. - conn->sockIdx = sock-rasSockets; - sock->connIdx = conn-rasConns; + conn->sock = sock; + sock->conn = conn; rasPfds[sock->pfd].fd = sock->sock.fd; // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because @@ -141,6 +155,7 @@ static void rasConnOpen(struct rasConnection* conn) { fail: if (closeSocketOnFail) (void)ncclSocketClose(&sock->sock); + freeSockEntry(sock); goto exit; } @@ -166,16 +181,13 @@ static ncclResult_t rasConnPrepare(struct rasConnection* conn) { } // Searches through rasConns for a connection with a provided address. -int rasConnFind(const union ncclSocketAddress* addr) { - // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way) - // so binary search won't do... - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) - return i; +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) + return conn; } - return -1; + return nullptr; } // Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled @@ -184,58 +196,56 @@ int rasConnFind(const union ncclSocketAddress* addr) { // This is also where we declare peers as dead, etc. // Invoked from the main RAS event loop. void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int connIdx = 0; connIdx < nRasConns; connIdx++) { - struct rasConnection* conn = rasConns+connIdx; - - if (!conn->inUse) - continue; - - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + if (conn->sock) { bool sockTerminated = false; // Retry the socket connections that have been refused. 
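One detail worth calling out in the rewritten rasConnsHandleTimeouts() loop above: handling a timeout can free the current list entry (for example via rasConnTerminate), so the loop captures conn->next before doing any work on conn. A minimal, standalone sketch of that save-next-then-process idiom, with hypothetical names:

#include <cstdlib>

struct item { item* next; };

// process() may free 'it', so the caller must not touch it afterwards.
static void process(item* it) { free(it); }

static void walkAndMaybeFree(item* head) {
  for (item* it = head; it;) {
    item* itNext = it->next;   // capture before process(), which may free 'it'
    process(it);
    it = itNext;
  }
}

The same idiom appears in rasConnTerminate and rasSocksHandleTimeouts later in this patch, wherever iteration and termination of the same list are interleaved.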
- if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) { - if (now - sock->lastSendTime > RAS_CONNECT_RETRY) { + if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) { + if (now - conn->sock->lastSendTime > RAS_CONNECT_RETRY) { int ready; - if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + if (ncclSocketReady(&conn->sock->sock, &ready) != ncclSuccess) { INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s", - ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true); + ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true); // We will retry below in the same loop. sockTerminated = true; } else { // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations. - sock->lastSendTime = clockNano(); - if (!ready && sock->sock.state == ncclSocketStateConnecting) - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + conn->sock->lastSendTime = clockNano(); + if (!ready && conn->sock->sock.state == ncclSocketStateConnecting) + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); else - rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop. + rasPfds[conn->sock->pfd].fd = conn->sock->sock.fd; // Enable the handling via the main loop. } // if (ncclSocketReady) } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); } - } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) + } // if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) // For connections that have data to send but that we've been unable to send a message on for a while, // consider their sockets lost and terminate them. - if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) { - if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { + if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) { + if (now - std::max(conn->sock->lastSendTime, + ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s", - (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / - CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT); + (now - std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / + CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/false, RAS_STUCK_TIMEOUT); // We will retry below in the same loop. 
} else { - *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, - ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT); + *nextWakeup = std::min(*nextWakeup, + std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+ + RAS_STUCK_TIMEOUT); } - } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (!ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) // For connections that are being (re-)established, irrespective of whether there's a valid socket associated - // with them (conn->startIdx != -1), we need to check if any connection-level timeout has expired. + // with them, we need to check if any connection-level timeout has expired. if (conn->startRetryTime) { + bool connTerminated = false; // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead // so that we don't try again. if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { @@ -248,82 +258,83 @@ void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { rasCollReqInit(&bCast); bCast.type = RAS_BC_DEADPEER; memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); - (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + (void)rasNetSendCollReq(&bCast); - continue; + connTerminated = true; } else { *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); } // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via - // the conn->sockIdx == -1 test). - - // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try - // to establish fallback connections. - if (now - conn->startRetryTime > RAS_CONNECT_WARN) { - if (!conn->experiencingDelays) { - INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", - (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); - - // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback - // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish - // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. - conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns may have been reallocated by the above calls. - conn = rasConns+connIdx; - - // Stop collectives from waiting for a response over it. - rasCollsPurgeConn(connIdx); - } // if (!conn->experiencingDelays) - } else { - *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); - } + // the conn->sock == nullptr test). + + if (!connTerminated) { + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. 
+ conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(conn); + } // if (!conn->experiencingDelays) + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); + } - // If a socket was terminated (or never opened, due to some error), try to open it now. - // We retry once a second. - if (conn->sockIdx == -1) { - if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { - INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", - ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, - (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); - rasConnOpen(conn); + // If a socket was terminated (or never opened, due to some error), try to open it now. + // We retry once a second. + if (conn->sock == nullptr) { + if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { + INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", + ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, + (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); + rasConnOpen(conn); + } + if (conn->sock == nullptr) + *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); } - if (conn->sockIdx == -1) - *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); - } + } // if (!connTerminated) } // if (conn->startRetryTime) - } // for (connIdx) + + conn = connNext; + } // for (conn) } // Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the // RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead. void rasConnDisconnect(const union ncclSocketAddress* addr) { - int connIdx = rasConnFind(addr); - if (connIdx != -1) { - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - rasLinkDropConn(&rasNextLink, connIdx); - rasLinkDropConn(&rasPrevLink, connIdx); - - rasConnTerminate(rasConns+connIdx); + struct rasConnection* conn = rasConnFind(addr); + if (conn) { + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + rasLinkConnDrop(&rasNextLink, conn); + rasLinkConnDrop(&rasPrevLink, conn); + + rasConnTerminate(conn); } } // Terminates a connection and frees the rasConns entry. static void rasConnTerminate(struct rasConnection* conn) { - int connIdx = conn - rasConns; - // Make sure there are no lingering rasSockets pointing to it. - for (int i = 0; i < nRasSockets; i++) { - struct rasSocket* sock = rasSockets+i; - if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx) + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (sock->conn == conn) rasSocketTerminate(sock, /*finalize*/true); + sock = sockNext; } // Also check any ongoing collectives. - rasCollsPurgeConn(connIdx); + rasCollsPurgeConn(conn); while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) { free(meta); @@ -331,8 +342,7 @@ static void rasConnTerminate(struct rasConnection* conn) { INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine)); - conn->inUse = false; - conn->sockIdx = -1; // Should be that way already, but just to be extra sure... 
+ freeConnEntry(conn); } @@ -344,7 +354,7 @@ static void rasConnTerminate(struct rasConnection* conn) { // corresponding rasConnection can't be established without knowing the peer's address. ncclResult_t rasNetAcceptNewSocket() { ncclResult_t ret = ncclSuccess; - struct rasSocket* sock; + struct rasSocket* sock = nullptr; int ready; bool socketInitialized = false; NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail); @@ -370,91 +380,98 @@ ncclResult_t rasNetAcceptNewSocket() { fail: if (socketInitialized) NCCLCHECK(ncclSocketClose(&sock->sock)); + freeSockEntry(sock); goto exit; } -// Returns the index of the first available entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasSockets list. static ncclResult_t getNewSockEntry(struct rasSocket** pSock) { struct rasSocket* sock; - int i; - for (i = 0; i < nRasSockets; i++) - if (rasSockets[i].status == RAS_SOCK_CLOSED) - break; - if (i == nRasSockets) { - NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT)); - nRasSockets += RAS_INCREMENT; - } - sock = rasSockets+i; - memset(sock, '\0', sizeof(*sock)); + NCCLCHECK(ncclCalloc(&sock, 1)); + sock->pfd = -1; - sock->connIdx = -1; sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano(); + if (rasSocketsHead) { + rasSocketsTail->next = sock; + sock->prev = rasSocketsTail; + rasSocketsTail = sock; + } else { + rasSocketsHead = rasSocketsTail = sock; + } + *pSock = sock; return ncclSuccess; } +// Frees an entry from the rasSockets list. +static void freeSockEntry(struct rasSocket* sock) { + if (sock == nullptr) + return; + + if (sock == rasSocketsHead) + rasSocketsHead = rasSocketsHead->next; + if (sock == rasSocketsTail) + rasSocketsTail = rasSocketsTail->prev; + if (sock->prev) + sock->prev->next = sock->next; + if (sock->next) + sock->next->prev = sock->prev; + free(sock); +} + // Invoked from the main RAS event loop to handle RAS socket timeouts. void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; - if (sock->status == RAS_SOCK_CLOSED) - continue; - - // For socket connections that are still being established, give up on the ones that take too long to initialize. if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) { + // For socket connections that are still being established, give up on the ones that take too long to initialize. if (now - sock->createTime > RAS_STUCK_TIMEOUT) { - if (sock->connIdx == -1) { + if (sock->conn == nullptr) { INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); } else { - struct rasConnection* conn = rasConns+sock->connIdx; INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s " "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine), - conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0), - sock->status); + sock->conn->experiencingDelays, + (sock->conn->startRetryTime ? (now-sock->conn->startRetryTime)/1e9 : 0.0), sock->status); } rasSocketTerminate(sock, /*finalize*/true); // We may retry later. 
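
getNewSockEntry and freeSockEntry above replace the old index-based rasSockets array with a head/tail doubly-linked list (the rasConns list freed via freeConnEntry follows the same scheme), so entries keep stable addresses and can be released individually. The list manipulation, reduced to a generic sketch with a hypothetical Node type:

    #include <cstdlib>

    struct Node { Node* prev; Node* next; };   // stand-in for rasSocket/rasConnection
    static Node* head = nullptr;
    static Node* tail = nullptr;

    // Append a freshly calloc'ed node at the tail, as getNewSockEntry does.
    static void listAppend(Node* n) {
      n->prev = tail; n->next = nullptr;
      if (tail) tail->next = n; else head = n;
      tail = n;
    }

    // Unlink a node and free it, mirroring freeSockEntry above.
    static void listRemove(Node* n) {
      if (n == nullptr) return;
      if (n == head) head = n->next;
      if (n == tail) tail = n->prev;
      if (n->prev) n->prev->next = n->next;
      if (n->next) n->next->prev = n->prev;
      free(n);
    }
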
- continue; } else { *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) - - // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. - if (sock->status == RAS_SOCK_TERMINATING) { + } else if (sock->status == RAS_SOCK_TERMINATING) { + // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s", (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // This socket is presumably already being re-established, if needed. - continue; } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_TERMINATING) - - // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything - // important due to shorter timeouts on RAS network connections, but in case of weird situations like process - // suspend, rasSocketTerminate will do additional checking. - if (sock->status == RAS_SOCK_READY) { + } else if (sock->status == RAS_SOCK_READY) { + // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything + // important due to shorter timeouts on RAS network connections, but in case of weird situations like process + // suspend, rasSocketTerminate will do additional checking. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) { INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s", (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false); - continue; // The RAS network timeout handler will terminate the conn it was associated with, if any. } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT); } } // if (sock->status == RAS_SOCK_READY) - } // for (sockIdx) + + sock = sockNext; + } // for (sock) } // Handles the termination of a RAS socket. @@ -464,19 +481,19 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For not fully established sockets, we can terminate immediately as there's no useful data to extract. void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) { assert(sock->status != RAS_SOCK_CLOSED); - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - // If the sockIdx of the connection points back to us, it means that we are the current socket of this + if (sock->conn) { + struct rasConnection* conn = sock->conn; + // If the sock of the connection points back to us, it means that we are the current socket of this // connection, so we have additional work to do before we can terminate it. - if (conn->sockIdx == sock-rasSockets) { + if (conn->sock == sock) { // Reset it to indicate there's no valid socket associated with that connection anymore. 
- conn->sockIdx = -1; + conn->sock = nullptr; // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably // deliberately closed them. Make an exception for sockets that are part of the RAS network links. if ((retry && clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) < RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) || - rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) { + rasLinkConnFind(&rasNextLink, sock->conn) || rasLinkConnFind(&rasPrevLink, sock->conn)) { // For connections that were fine until now, the connection-level timeout starts at termination, and possibly // even earlier, depending on what event trigerred the termination -- if it was another timeout expiring, then // we need to include that timeout as well. @@ -507,11 +524,11 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } // if (retry) // Stop collectives from waiting for a response over this connection. - rasCollsPurgeConn(sock->connIdx); - } // if (conn->sockIdx == sock-rasSockets) - } // if (sock->connIdx != -1) + rasCollsPurgeConn(sock->conn); + } // if (conn->sock == sock) + } // if (sock->conn) - if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) { + if (sock->status != RAS_SOCK_CONNECTING && sock->conn && !finalize && (rasPfds[sock->pfd].events & POLLIN)) { if (sock->status != RAS_SOCK_TERMINATING) { // The receiving side is still open -- close just the sending side. (void)ncclSocketShutdown(&sock->sock, SHUT_WR); @@ -525,20 +542,15 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } else { // Either the caller requested finalization or we cannot receive on it. (void)ncclSocketClose(&sock->sock); - sock->status = RAS_SOCK_CLOSED; rasPfds[sock->pfd].fd = -1; rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; - sock->pfd = sock->connIdx = -1; - sock->recvOffset = sock->recvLength = 0; free(sock->recvMsg); - sock->recvMsg = nullptr; + freeSockEntry(sock); } } // Handles a ready socket FD from the main event loop. -void rasSockEventLoop(int sockIdx, int pollIdx) { - struct rasSocket* sock = rasSockets+sockIdx; - +void rasSockEventLoop(struct rasSocket* sock, int pollIdx) { if (sock->status == RAS_SOCK_CONNECTING) { int ready; // Socket is not yet fully established. Continue the OS or NCCL-level handshake. @@ -554,15 +566,15 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano(); sock->status = RAS_SOCK_HANDSHAKE; if (connectSide) { - assert(sock->connIdx != -1); - if (rasConns[sock->connIdx].sockIdx == sockIdx) { - if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) { + assert(sock->conn); + if (sock->conn->sock == sock) { + if (rasConnPrepare(sock->conn) != ncclSuccess) { INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s", ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock); // We may retry further down. } - } else { + } else { // sock->conn->sock != sock // The connection this socket is associated with no longer considers it to be the current one. // This could possibly happen due to a race condition. Simply terminate it. 
INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!", @@ -581,10 +593,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) { int closed = 0; bool allSent = false; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; - assert(conn->sockIdx == sockIdx); - if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) { + assert(sock->conn); + assert(sock->conn->sock == sock); + if (rasConnSendMsg(sock->conn, &closed, &allSent) != ncclSuccess) { INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s", ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock); @@ -612,9 +623,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { // We may retry further down. } else if (closed) { const char* socketType; - if (sock->connIdx == -1) + if (sock->conn == nullptr) socketType = "incoming"; - else if (rasConns[sock->connIdx].sockIdx != sockIdx) + else if (sock->conn->sock != sock) socketType = "old"; else if (sock->status == RAS_SOCK_HANDSHAKE) socketType = "new"; @@ -624,25 +635,21 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { socketType, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // We may retry further down. - } else { + } else { // !closed sock->lastRecvTime = clockNano(); if (msg) { (void)rasMsgHandle(msg, sock); free(msg); - // Message handlers can terminate a socket in certain cases; we need to check for - // that here so that we don't try to receive from a closed socket. - // No handlers are currently believed to create new sockets but better to be safe than sorry - // and re-init the sock variable. - sock = rasSockets+sockIdx; - if (sock->status == RAS_SOCK_CLOSED) + // Message handlers can terminate a socket in various cases. We re-check rasPfds.events to ensure that + // this hasn't happened here (rasSocketTerminate will reset it when finalizing a socket). + if (!(rasPfds[pollIdx].revents & POLLIN)) break; } - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays)) - rasConnResume(conn); + if (sock->conn) { + if (sock->conn->sock == sock && (sock->conn->startRetryTime || sock->conn->experiencingDelays)) + rasConnResume(sock->conn); } - } + } // !closed } while (msg); } // if (POLLIN) } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING @@ -658,109 +665,95 @@ void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) { // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish // connections that are part of a link from those that are not. 
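
That flag-based pass boils down to: clear linkFlag on every connection, let the two link walks mark the connections they own, then terminate whatever is left unmarked once it has nothing to send. A simplified sketch (Conn is a stand-in for rasConnection; the link walks are only hinted at):

    // Stand-in type; only the fields needed for the sketch.
    struct Conn { Conn* next; bool linkFlag; bool idle; };

    static void handleNetTimeouts(Conn* connsHead) {
      // 1. Clear the per-connection flag.
      for (Conn* c = connsHead; c; c = c->next) c->linkFlag = false;
      // 2. Walking rasNextLink/rasPrevLink sets linkFlag on every connection they reference
      //    (done by rasLinkHandleNetTimeouts in the code below).
      // 3. Anything still unflagged belongs to no link; drop it once it is idle.
      for (Conn* c = connsHead; c;) {
        Conn* next = c->next;            // save the successor before the node may be freed
        if (!c->linkFlag && c->idle) { /* rasConnTerminate(c) would free c here */ }
        c = next;
      }
    }
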
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
   (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
   (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
-    struct rasConnection* conn = rasConns+connIdx;
-    if (conn->inUse && !conn->linkFlag) {
+  for (struct rasConnection* conn = rasConnsHead; conn;) {
+    struct rasConnection* connNext = conn->next;
+    if (!conn->linkFlag) {
       // The connection is not part of any link. Check if it should be terminated.
-      if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) {
+      if (conn->sock == nullptr && ncclIntruQueueEmpty(&conn->sendQ))
         rasConnTerminate(conn);
-        continue;
-      }
     }
+    conn = connNext;
   }
 }
 
 // Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
 static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1) {
-      if (!rasConns[linkConn->connIdx].linkFlag) {
-        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
-        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
-        // For the same reason we re-init linkConn.
-        linkConn = link->conns+i;
-        rasConns[linkConn->connIdx].linkFlag = true;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn) {
+      if (!linkConn->conn->linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->conn, now, nextWakeup);
+        linkConn->conn->linkFlag = true;
       }
-    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+    } else if (linkConn == link->conns && link->lastUpdatePeersTime != 0) {
       // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
       // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
       if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
         INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
              (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-        if (linkConn->connIdx != -1) {
-          rasConns[linkConn->connIdx].linkFlag = true;
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
+        if (linkConn->conn) {
+          linkConn->conn->linkFlag = true;
         }
-        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
-        // in this case is more intuitive.
-        //(void)rasLinkTryFallback(link, -1);
         link->lastUpdatePeersTime = 0;
       } else {
         *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
       }
-    } // if (i == 0 && link->lastUpdatePeerTime != 0)
-  } // for (i)
+    } // if (linkConn == link->conns && link->lastUpdatePeersTime != 0)
+  } // for (linkConn)
   return ncclSuccess;
 }
 
 // Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links.
-static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { - struct rasConnection* conn = rasConns+connIdx; - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - - if (sock->status == RAS_SOCK_READY) { +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup) { + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY) { // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued. if (ncclIntruQueueEmpty(&conn->sendQ)) { - if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { + if (now - conn->sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { rasConnSendKeepAlive(conn); } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); } } // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { if (!conn->experiencingDelays) { INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); // At this point, it's mostly just a precaution; we will continue with the primary connection until // RAS_PEER_DEAD_TIMEOUT expires. conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns and rasSockets may have been reallocated by the above calls. - conn = rasConns+connIdx; - sock = rasSockets+conn->sockIdx; - - // Stop collectives from waiting for a response over it. - rasCollsPurgeConn(connIdx); + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + + // Stop ongoing collectives from waiting for a response over this connection. + rasCollsPurgeConn(conn); } } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); } // For long timeouts we need to act. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait. } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); } - } // if (sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) } // Sends a keep-alive message to a peer on the RAS network. 
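
The keep-alive machinery above works off three thresholds: after RAS_KEEPALIVE_INTERVAL of send-idle time a keep-alive is sent, after RAS_KEEPALIVE_TIMEOUT_WARN without received traffic fallbacks are prepared, and after RAS_KEEPALIVE_TIMEOUT_ERROR the socket is terminated. Collapsed into a single decision for brevity (the real handler evaluates each threshold independently and uses the actual RAS constants):

    #include <cstdint>

    enum class KeepAliveAction { None, SendKeepAlive, StartFallbacks, TerminateSocket };

    // Placeholder thresholds in nanoseconds; stand-ins for the RAS_KEEPALIVE_* constants.
    constexpr int64_t kIntervalNs = 1, kWarnNs = 2, kErrorNs = 3;

    static KeepAliveAction nextKeepAliveAction(int64_t now, int64_t lastSend, int64_t lastRecv,
                                               bool sendQueueEmpty) {
      if (now - lastRecv > kErrorNs) return KeepAliveAction::TerminateSocket;  // hard timeout
      if (now - lastRecv > kWarnNs)  return KeepAliveAction::StartFallbacks;   // warn + fallbacks
      if (sendQueueEmpty && now - lastSend > kIntervalNs) return KeepAliveAction::SendKeepAlive;
      return KeepAliveAction::None;
    }
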
@@ -768,17 +761,17 @@ static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
   struct rasMsg* msg = nullptr;
   int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
   if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
-    int linkIdx;
+    struct rasLinkConn* linkConn;
     msg->type = RAS_MSG_KEEPALIVE;
     msg->keepAlive.peersHash = rasPeersHash;
     msg->keepAlive.deadPeersHash = rasDeadPeersHash;
     msg->keepAlive.nack = (nack ? 1 : 0);
 
-    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
-    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasNextLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
-    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
-    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasPrevLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
 
     (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
@@ -793,46 +786,51 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s
   int64_t travelTime;
   int peerIdx;
 
-  assert(sock->connIdx != -1);
-  struct rasConnection* conn = rasConns+sock->connIdx;
+  assert(sock->conn);
   SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
   travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
                (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
 
-  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
-    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  if (msg->keepAlive.peersHash != sock->conn->lastRecvPeersHash) {
+    sock->conn->lastRecvPeersHash = msg->keepAlive.peersHash;
   }
-  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
-    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  if (msg->keepAlive.deadPeersHash != sock->conn->lastRecvDeadPeersHash) {
+    sock->conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
   }
 
   // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
   // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
   // needed).
-  peerIdx = rasPeerFind(&conn->addr);
+  peerIdx = rasPeerFind(&sock->conn->addr);
   // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
   // the peers update.
-  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
-  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  if (msg->keepAlive.linkMask & 1)
+    (void)rasLinkConnAddExternal(&rasNextLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasNextLink, sock->conn, /*external*/true);
+  if (msg->keepAlive.linkMask & 2)
+    (void)rasLinkConnAddExternal(&rasPrevLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasPrevLink, sock->conn, /*external*/true);
 
   // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
   // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
-  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
-  // will have wiped any external fallbacks, so anything that remains must be needed.
+ // and because we stopped sending the keep-alives, our peer doesn't know about it. The rasLinkConnDrop calls + // above will have wiped any external fallbacks, so anything that remains must be needed. if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) { - if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) { + if (rasLinkConnFind(&rasNextLink, sock->conn) == nullptr && rasLinkConnFind(&rasPrevLink, sock->conn) == nullptr) { // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the // special nack flag in the message to distinguish it from regular keep-alives. - rasConnSendKeepAlive(conn, /*nack*/true); + rasConnSendKeepAlive(sock->conn, /*nack*/true); } } - if (conn->travelTimeMin > travelTime) - conn->travelTimeMin = travelTime; - if (conn->travelTimeMax < travelTime) - conn->travelTimeMax = travelTime; - conn->travelTimeSum += travelTime; - conn->travelTimeCount++; + if (sock->conn->travelTimeMin > travelTime) + sock->conn->travelTimeMin = travelTime; + if (sock->conn->travelTimeMax < travelTime) + sock->conn->travelTimeMax = travelTime; + sock->conn->travelTimeSum += travelTime; + sock->conn->travelTimeCount++; if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) { // This could happen due to a short-lived race condition between the peers propagation @@ -842,7 +840,7 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)", ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash); INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash); - NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + NCCLCHECK(rasConnSendPeersUpdate(sock->conn, rasPeers, nRasPeers)); } return ncclSuccess; } @@ -857,100 +855,104 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s // External connections are generally ignored by this whole process: in particular, we don't add fallbacks for // timing out external connections. However, we will use an active external connection if it would be a better // option than whatever we can come up with. -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { - int peerIdx = -1; - int linkIdx = -1; +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn) { + struct rasLinkConn* foundLinkConn = nullptr; + struct rasLinkConn* firstExtLinkConn = nullptr; int firstExtLinkIdx = -1; - int newPeerIdx; + int newPeerIdx, i; // First check if the connection is part of this link. In the process also check if any of the link's connections // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out. - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { if (linkConn->peerIdx == -1) { - // Such elements are always at the very end of the array and we can't use them so we can just as well break. + // Such elements are always at the end and we can't use them so we can just as well break. break; } // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing // delays). 
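
The linkMask bits exchanged in the keep-alives above mirror the two links across the connection: the sender sets bit 2 when the connection is a non-external member of its rasNextLink and bit 1 for its rasPrevLink, and the receiver maps bit 1 to its rasNextLink and bit 2 to its rasPrevLink, adding or dropping external entries accordingly. The same mapping as a small sketch:

    #include <cstdint>

    constexpr uint32_t kSenderPrevLinkBit = 1;  // "our rasPrevLink should be the peer's rasNextLink"
    constexpr uint32_t kSenderNextLinkBit = 2;  // "our rasNextLink should be the peer's rasPrevLink"

    // Sender side: advertise which links this connection (non-externally) belongs to.
    static uint32_t encodeLinkMask(bool inNextLink, bool inPrevLink) {
      uint32_t mask = 0;
      if (inNextLink) mask |= kSenderNextLinkBit;
      if (inPrevLink) mask |= kSenderPrevLinkBit;
      return mask;
    }

    // Receiver side: a set bit keeps/adds the connection in the mirrored link,
    // a cleared bit drops it from that link if it was only there as an external entry.
    static void decodeLinkMask(uint32_t mask, bool* keepInNextLink, bool* keepInPrevLink) {
      *keepInNextLink = (mask & kSenderPrevLinkBit) != 0;
      *keepInPrevLink = (mask & kSenderNextLinkBit) != 0;
    }
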
- if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; - if (!conn->experiencingDelays) { - if (!linkConn->external) + if (linkConn->conn && linkConn->conn != conn) { + if (!linkConn->conn->experiencingDelays) { + if (!linkConn->external) { goto exit; // We don't need to do anything if there's a non-external connection. - else if (linkConn->peerIdx != -1) { + } else if (linkConn->peerIdx != -1) { // Record the location of the first potentially viable external connection in the chain; we may prefer it // over anything we can come up with. - if (firstExtLinkIdx == -1) + if (firstExtLinkConn == nullptr) { + firstExtLinkConn = linkConn; firstExtLinkIdx = i; - if (linkIdx != -1) + } + if (foundLinkConn) break; // Break out of the loop if we already have all the data we might need. } // linkConn->external && linkConn->peerIdx != -1 - } // if (!conn->experiencingDelays) - } // if (linkConn->connIdx != -1) + } // if (!linkConn->conn->experiencingDelays) + } // if (linkConn->conn && linkConn->conn != conn) - if (linkConn->connIdx == connIdx) { + if (linkConn->conn == conn) { if (linkConn->external) goto exit; // We don't add fallbacks for external connections... - peerIdx = linkConn->peerIdx; - linkIdx = i; + foundLinkConn = linkConn; // We are not breaking out of the loop here because we want to check for active connections on *all* potentially // viable elements (in particular, there could be some external ones beyond this one). } } - if (linkIdx == -1) + if (foundLinkConn == nullptr) goto exit; // We found an existing element so the connection is part of the link. No existing non-external connections of this // link are active, so a fallback is needed. - assert(peerIdx != -1); - newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0); + assert(foundLinkConn->peerIdx != -1); + newPeerIdx = rasLinkCalculatePeer(link, foundLinkConn->peerIdx, /*isFallback*/(foundLinkConn != link->conns)); // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists // and is also experiencing delays, we need to keep iterating. while (newPeerIdx != -1) { - int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); + struct rasConnection* newConn = rasConnFind(&rasPeers[newPeerIdx].addr); + int linkIdx; + struct rasLinkConn* newLinkConn; // If we previously found a potential external fallback connection, check if it's better than what we just found. - if (firstExtLinkIdx != -1) { + if (firstExtLinkConn) { linkIdx = -1; // Calculate the index that the newly found fallback would have (pretend mode). - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true, - &linkIdx)); + NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/true, &linkIdx)); assert(linkIdx != -1); if (firstExtLinkIdx < linkIdx) { // The external connection *is* better -- use it as a fallback instead and be done. - link->conns[firstExtLinkIdx].external = false; + firstExtLinkConn->external = false; goto exit; } } - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false, - &linkIdx)); - if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx) - firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index. 
+ NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/false, &linkIdx, &newLinkConn)); + if (firstExtLinkConn && linkIdx <= firstExtLinkIdx) + firstExtLinkIdx++; // Adjust if we inserted a new entry ahead of this one. INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s", - link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"), + link->direction, (newConn == nullptr ? "opening new" : "calculated existing"), linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine)); // Note that we don't follow here our convention of "lower address is the one establishing connections" -- // that convention is for optimizing regular operations, but we don't want to take chances during fault // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those. - if (newConnIdx == -1) - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx)); + if (newConn == nullptr) { + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &newConn)); + newLinkConn->conn = newConn; + } - struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; // If the fallback connection is also experiencing delays, we need to keep trying. - if (!conn->experiencingDelays) + if (!newConn->experiencingDelays) break; INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + newConn->experiencingDelays, (newConn->startRetryTime ? (clockNano()-newConn->startRetryTime)/1e9 : 0.0), + (newConn->sock ? newConn->sock->status : -1)); newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); } - if (newPeerIdx == -1) - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); + if (newPeerIdx == -1) { + int nConns = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) + nConns++; + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (total %d)", link->direction, nConns); + } exit: return ncclSuccess; } @@ -958,7 +960,7 @@ static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { // Invoked when we receive a message over a connection that was just activated or was experiencing delays. // Cleans up the fallbacks, timers, etc, as appropriate. static void rasConnResume(struct rasConnection* conn) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"), ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), @@ -972,218 +974,362 @@ static void rasConnResume(struct rasConnection* conn) { rasLinkSanitizeFallbacks(&rasPrevLink); if (!ncclIntruQueueEmpty(&conn->sendQ)) - rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT; + rasPfds[conn->sock->pfd].events |= POLLOUT; } } // Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed). 
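
rasLinkAddFallback above keeps walking candidate peers until it finds one whose connection is not itself experiencing delays, reusing an existing connection when there is one and creating a new one otherwise; an already-present external entry wins if it would rank ahead of the computed candidate. The core of that walk as an illustrative skeleton (the peer selection and connection lookup are stubbed out, not the real rasLinkCalculatePeer/rasConnFind):

    #include <vector>

    struct Conn { bool experiencingDelays = false; };        // stand-in for rasConnection

    static std::vector<Conn> conns(8);                       // pretend one connection per peer

    // Stand-in for rasLinkCalculatePeer: next candidate peer, -1 when none is left.
    static int calculateNextPeer(int peerIdx, bool /*isFallback*/) {
      return (peerIdx + 1 < (int)conns.size()) ? peerIdx + 1 : -1;
    }

    // Stand-in for rasConnFind/rasConnCreate: reuse or create the connection to a peer.
    static Conn* findOrCreateConn(int peerIdx) { return &conns[peerIdx]; }

    static void addFallback(int startPeerIdx, bool startIsFallback) {
      for (int peerIdx = calculateNextPeer(startPeerIdx, startIsFallback); peerIdx != -1;
           peerIdx = calculateNextPeer(peerIdx, /*isFallback*/true)) {
        Conn* conn = findOrCreateConn(peerIdx);
        if (!conn->experiencingDelays) break;   // usable fallback found; at most one is added per call
        // This candidate is struggling too; keep walking to the next peer.
      }
    }
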
static void rasLinkSanitizeFallbacks(struct rasLink* link) { - if (link->nConns > 0 && link->conns[0].connIdx != -1) { - struct rasConnection* conn = rasConns+link->conns[0].connIdx; - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + if (link->conns && link->conns->conn) { + struct rasConnection* conn = link->conns->conn; + if (conn->sock && conn->sock->status == RAS_SOCK_READY && !conn->experiencingDelays) { // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the // keepAlive messages). - for (int i = 1; i < link->nConns; i++) { + int i = 1; + for (struct rasLinkConn* linkConn = link->conns->next; linkConn; i++) { + struct rasLinkConn* linkConnNext = linkConn->next; INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[i].external ? "external " : ""), i, - ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine)); + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&linkConn->conn->addr, rasLine)); + free(linkConn); + linkConn = linkConnNext; } - link->nConns = 1; + link->conns->next = nullptr; link->lastUpdatePeersTime = 0; } } } -// Attempt to drop a connection from a link. -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) { - if (linkIdx == -1) - linkIdx = rasLinkFindConn(link, connIdx); - if (linkIdx != -1) { - if (linkIdx == 0) { - INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", - link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } - memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns)); - if (link->nConns > 1) - link->nConns--; - else { - link->conns[0].peerIdx = link->conns[0].connIdx = -1; +// Adds an entry to a RAS network link (or updates one, if it already exists). +// conn can be nullptr if the connection doesn't exist (yet). +// peerIdx *cannot* be -1 when this function is invoked. +// If pretend is true, the function will not modify the list and will just set *pLinkIdx and *pLinkConn as appropriate. +// pLinkIdx and pLinkConn are (optional) pointers to the results; the index/address of the added/updated entry are +// stored there. +// insert (true by default) determines whether this is an "add" function (as implied by the name) or an "update" -- +// if set to false, it will refuse to add a new entry (but will update an existing one as needed). +// Note: there is some code duplication between this function and rasLinkConnAddExternal so changes to one of them +// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the +// logic was extremely difficult to follow then. +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend, + int* pLinkIdx, struct rasLinkConn** pLinkConn, bool insert) { + struct rasLinkConn* oldLinkConn = nullptr; + struct rasLinkConn* linkConnPrev = nullptr; + int i, oldLinkIdx = -1; + + assert(peerIdx != -1); + if (conn) { + // Start by checking if we already have an element with this conn. 
+ oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx); + if (oldLinkConn) { + if (pLinkConn) + *pLinkConn = oldLinkConn; + if (oldLinkConn->peerIdx != -1) { + assert(oldLinkConn->peerIdx == peerIdx); + + if (!pretend) + oldLinkConn->external = false; // Ensure that external is cleared. + if (pLinkIdx) + *pLinkIdx = oldLinkIdx; + goto exit; // Nothing more to do if both conn and peerIdx are up to date. + } // if (oldLinkConn->peerIdx != -1) + + // Otherwise oldLinkConn->peerIdx == -1. The oldLinkConn is in a wrong place in the list -- we need to find + // the right spot. This can happen only for external connections. + } // if (oldLinkConn) + } // if (conn) + + // Search for the right spot in the conns list. + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->peerIdx == peerIdx) { + // The exact linkConn element already exists. + if (linkConn->conn) + assert(linkConn->conn == conn); + if (!pretend) { + if (linkConn->conn == nullptr) + linkConn->conn = conn; + linkConn->external = false; // Ensure that external is cleared. + if (linkConn == link->conns) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. + rasLinkSanitizeFallbacks(link); + } + } // if (!pretend) + if (pLinkIdx) + *pLinkIdx = i; + if (pLinkConn) + *pLinkConn = linkConn; + goto exit; + } // if (linkConn->peerIdx == peerIdx) + + // Ensure that the previous element is valid. + if (linkConnPrev == nullptr) + continue; + // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done. + if (linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. + if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; } + } // for (linkConn) - if (linkIdx == 0) { - // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if - // the remote peer loses interest in it). - link->conns[0].external = false; - if (link->conns[0].connIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", - link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine)); - } - rasLinkSanitizeFallbacks(link); + // The new element should be inserted after linkConnPrev (which is at index i-1). + if (pLinkIdx) + *pLinkIdx = i; + if (pretend) + goto exit; + + if (oldLinkConn) { + if (i != oldLinkIdx) { + // We already have the entry, but we need to move it to a new spot (which must be earlier in the list). + assert(i < oldLinkIdx); + // Remove oldLinkConn from its old spot. + for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) { + if (linkConn->next == oldLinkConn) { + linkConn->next = oldLinkConn->next; + break; + } + } // for (linkConn) + // Insert it at its new spot. 
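
The search loop above keeps link->conns ordered by preference in the link's direction, with peer indices that may wrap around the ring and with unknown (-1) peers parked at the end; the direction-scaled differences plus the roll-over branch decide where a new peerIdx belongs. The placement test in isolation (direction is +1 or -1, as for the real links):

    // Returns true if 'candidate' belongs between 'prevPeer' and 'curPeer' in a list ordered
    // in 'direction' (+1 or -1), allowing for a single wrap-around point in the ring of peers.
    static bool belongsBetween(int direction, int prevPeer, int curPeer, int candidate) {
      if (direction * (prevPeer - curPeer) > 0) {
        // prev..cur spans the roll-over: the candidate fits if it lies past prev or before cur.
        return direction * (candidate - prevPeer) > 0 || direction * (candidate - curPeer) < 0;
      }
      // Monotonic case: the candidate fits strictly between the two neighbors.
      return direction * (candidate - prevPeer) > 0 && direction * (candidate - curPeer) < 0;
    }
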
+ oldLinkConn->next = linkConnPrev->next; + linkConnPrev->next = oldLinkConn; + } // if (i != oldLinkIdx) + oldLinkConn->peerIdx = peerIdx; + oldLinkConn->external = false; + } else if (insert) { + struct rasLinkConn* linkConn; + NCCLCHECK(ncclCalloc(&linkConn, 1)); + if (linkConnPrev) { + linkConn->next = linkConnPrev->next; + linkConnPrev->next = linkConn; + } else { + assert(link->conns == nullptr); // We never add an element that would replace an existing primary. + link->conns = linkConn; + // linkConn->next is already nullptr. } - } -} + linkConn->peerIdx = peerIdx; + linkConn->conn = conn; + linkConn->external = false; + if (pLinkConn) + *pLinkConn = linkConn; + } // oldLinkConn == nullptr && insert -// Checks if a given connection is a member of this link and if so, returns its entry index. -// Returns -1 if connection not found. -static int rasLinkFindConn(const struct rasLink* link, int connIdx) { - for (int i = 0; i < link->nConns; i++) { - if (link->conns[i].connIdx == connIdx) - return i; - } - return -1; +exit: + return ncclSuccess; } -// Note: the behavior of this function has become super-complex and so it should be considered for refactoring. -// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is -// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also -// be -1 (the latter are stored at the end). -// external provides an updated value for the entry's external field. A false value, if requested, is always set; -// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry -// already exists and the function is invoked with external == true, the new value will be ignored. -// If insert is set, it will, if necessary, insert a new entry if one is not already there. -// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate. -// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored. -// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external). -// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed -// (the entry's external must match the argument external for it to be removed). -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert, - bool pretend, int* pLinkIdx) { +// Adds an external entry in a RAS network link (or updates one, if already exists). +// conn *cannot* be nullptr when this function is invoked. +// peerIdx can be -1 if unknown (possible in case of a race condition between keepAlive and peers update). +// Note: there is some code duplication between this function and rasLinkConnAdd so changes to one of them +// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the +// logic was extremely difficult to follow then. +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx) { + struct rasLinkConn* oldLinkConn = nullptr; + struct rasLinkConn* linkConnPrev = nullptr; int i, oldLinkIdx = -1; - if (external && connIdx != -1) - insert = true; - - if (connIdx != -1) { - // Start by checking if we already have an element with this connIdx. 
- oldLinkIdx = rasLinkFindConn(link, connIdx); - if (oldLinkIdx != -1) { - struct rasLinkConn* linkConn = link->conns+oldLinkIdx; - if (linkConn->peerIdx != -1) - assert(linkConn->peerIdx == peerIdx); - - if (linkConn->peerIdx == peerIdx) { - if (!external && !pretend) - linkConn->external = false; // Ensure that external is cleared if so requested. - if (pLinkIdx) - *pLinkIdx = oldLinkIdx; - goto exit; // Nothing more to do if both connIdx and peerIdx are up to date. + assert(conn); + oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx); + if (oldLinkConn) { + if (oldLinkConn->peerIdx != -1) + assert(oldLinkConn->peerIdx == peerIdx); + + if (oldLinkConn->peerIdx == peerIdx) + goto exit; // Nothing more to do if both conn and peerIdx are up to date. Note that we neither check nor + // update the value of external here. + + // Otherwise (oldLinkConn->peerIdx == -1 && peerIdx != -1) oldLinkConn, due to its -1 peerIdx, is in + // a wrong place in the array -- we need to find the right spot. oldLinkConn->peerIdx == -1 can only happen for + // external connections. + } // if (oldLinkConn) + + // Search for the right spot in the conns list. + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (peerIdx == -1) { + // We simply want to find the end of the list so that we can insert a new entry with -1 peerIdx there. + continue; + } + if (linkConn->peerIdx == peerIdx) { + // The exact linkConn element already exists. + if (linkConn->conn) + assert(linkConn->conn == conn); + if (linkConn->conn == nullptr) + linkConn->conn = conn; + if (linkConn == link->conns) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. This shouldn't trigger for external connections (rasLinkConnUpdate should be invoked first, + // which will update the entry's conn, so rasLinkConnFind invoked at the top of this function should succeed), + // but better safe than sorry... + rasLinkSanitizeFallbacks(link); } + goto exit; + } // if (linkConn->peerIdx == peerIdx) - // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong - // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external - // connections. - assert(external); + // Ensure that the previous element is valid. + if (linkConnPrev == nullptr) + continue; + // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done. + if (linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. + if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; } - } - - if (peerIdx != -1) { - // Search for the right spot in the conns array. - for (i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - if (peerIdx != -1 && linkConn->peerIdx == peerIdx) { - // The exact conn element already exists. - if (connIdx == -1 && !insert) { - // Drop the connection from the link. 
- if (linkConn->external == external) { - if (!pretend) - rasLinkDropConn(link, linkConn->connIdx, i); - else if (pLinkIdx) - *pLinkIdx = i; - } - } else { // connIdx != -1 || insert - if (!pretend) { - if (linkConn->connIdx != -1) - assert(linkConn->connIdx == connIdx); - else - linkConn->connIdx = connIdx; - if (!external) - linkConn->external = false; // Ensure that external is cleared if so requested. - if (i == 0) { - // We received a connection from the remote peer that matches the primary connection we've been - // waiting for. - rasLinkSanitizeFallbacks(link); - } - } // if (!pretend) - if (pLinkIdx) - *pLinkIdx = i; - } // connIdx != -1 || insert + } // for (linkConn) - goto exit; - } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx) - if (!insert) - continue; - // Ensure that the i-1 index is also valid. - if (i == 0) - continue; - // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them. - if (peerIdx != -1 && linkConn->peerIdx == -1) - break; - // Detect a roll-over and handle it specially. - if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) { - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 || - link->direction * (peerIdx - linkConn->peerIdx) < 0) - break; - } else { // Regular, monotonic case with the peerIdx value between two existing elements. - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 && - link->direction * (peerIdx - linkConn->peerIdx) < 0) + // The new element should be inserted after linkConnPrev (which is at index i-1). + if (oldLinkConn) { + if (i != oldLinkIdx) { + // We already have the entry, but we need to move it to a new spot (which must be earlier in the list). + assert(i < oldLinkIdx); + INFO(NCCL_RAS, "RAS link %d: moving %sfallback connection with %s from %d to %d", link->direction, + (oldLinkConn->external ? "external " : ""), ncclSocketToString(&conn->addr, rasLine), oldLinkIdx, i); + // Remove oldLinkConn from its old spot. + for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) { + if (linkConn->next == oldLinkConn) { + linkConn->next = oldLinkConn->next; break; - } - } // for (i) - } else { - // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections. - assert(external && oldLinkIdx == -1); - i = link->nConns; - } - if (!insert) - goto exit; - - // i holds the index at which to insert a new element. - if (pretend) { - if (pLinkIdx) - *pLinkIdx = i; - goto exit; - } - - if (oldLinkIdx == -1) { + } + } // for (linkConn) + // Insert it at its new spot. 
+ oldLinkConn->next = linkConnPrev->next; + linkConnPrev->next = oldLinkConn; + } // if (i != oldLinkIdx) + oldLinkConn->peerIdx = peerIdx; + oldLinkConn->external = false; + } else { // oldLinkConn == nullptr struct rasLinkConn* linkConn; - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; + NCCLCHECK(ncclCalloc(&linkConn, 1)); + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = linkConnPrev->next; + linkConnPrev->next = linkConn; + linkConn->external = true; + } else { + INFO(NCCL_RAS, "RAS link %d: adding external fallback with %s as a new primary connection", link->direction, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = link->conns; + link->conns = linkConn; + linkConn->external = false; // Primary connections are never external. } - linkConn = link->conns+i; - // Shift existing conns with indices >= i to make room for the new one. - memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns)); linkConn->peerIdx = peerIdx; - linkConn->connIdx = connIdx; - linkConn->external = external; - if (external) { - INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, - ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine)); + linkConn->conn = conn; + } // oldLinkConn == nullptr + +exit: + return ncclSuccess; +} + +// Updates an existing entry in a RAS network link, if any. +// Basically an easy-to-use variant of rasLinkConnAdd. +// For this function, conn cannot be a nullptr and peerIdx cannot be -1. +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx) { + assert(conn && peerIdx != -1); + + NCCLCHECK(rasLinkConnAdd(link, conn, peerIdx, /*pretend*/false, /*pLinkIdx*/nullptr, /*pLinkConn*/nullptr, + /*insert*/false)); + return ncclSuccess; +} + +// Attempts to drop a connection from a link. +// If the optional external argument is true, it will drop a connection only if its external flag is set +// (otherwise the flag is ignored and a connection is always dropped if found). +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external) { + struct rasLinkConn* linkConnPrev = nullptr; + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->conn == conn && (!external || linkConn->external)) { + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&conn->addr, rasLine)); + linkConnPrev->next = linkConn->next; + free(linkConn); + } else { // linkConnPrev == nullptr + INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", + link->direction, ncclSocketToString(&conn->addr, rasLine)); + if (linkConn->next) { + link->conns = linkConn->next; + // Ensure that the conn becoming the primary is not marked as external (we don't want to lose it if + // the remote peer loses interest in it). 
+ link->conns->external = false; + if (link->conns->conn) + INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", + link->direction, ncclSocketToString(&link->conns->conn->addr, rasLine)); + rasLinkSanitizeFallbacks(link); + free(linkConn); + } else { // linkConn->next == nullptr + // We prefer the primary entry to always be present, even if empty. + linkConn->peerIdx = -1; + linkConn->conn = nullptr; + } // linkConn->next == nullptr + } // linkConnPrev == nullptr + break; + } // if (linkConn->conn == conn) + } // for (linkConn) +} + +// Checks if a given connection is a member of this link and if so, returns its link entry. +// Optionally returns the position of the connection in the conns list. +// Returns nullptr if connection not found. +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx) { + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { + if (linkConn->conn == conn) { + if (pLinkIdx) + *pLinkIdx = i; + return linkConn; } - link->nConns++; } - else { // oldLinkIdx > -1 - // We already have the conn, we just need to move it to a new spot. - struct rasLinkConn* linkConn = link->conns+i; - assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1. - if (i != oldLinkIdx) { - struct rasLinkConn tmp; - struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler. - // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns - // with indices in the range [i, oldLinkIdx). - memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp)); - memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn)); - memcpy(linkConn, &tmp, sizeof(*linkConn)); - } - if (!external) - linkConn->external = false; // Ensure that external is cleared if so requested. - } // oldLinkIdx > -1 if (pLinkIdx) - *pLinkIdx = i; -exit: - return ncclSuccess; + *pLinkIdx = -1; + return nullptr; +} + +// Invoked during RAS termination to release all the allocated resources. +void rasNetTerminate() { + for (struct rasLinkConn* linkConn = rasNextLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + for (struct rasLinkConn* linkConn = rasPrevLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + rasNextLink.conns = rasPrevLink.conns = nullptr; + rasNextLink.lastUpdatePeersTime = rasPrevLink.lastUpdatePeersTime = 0; + + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + rasConnTerminate(conn); + conn = connNext; + } + // rasConnsHead and rasConnsTail are taken care of by rasConnTerminate(). + + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + rasSocketTerminate(sock); + sock = sockNext; + } + // rasSocketsHead and rasSocketsTail are taken care of by rasSocketTerminate(). 
}
diff --git a/src/register/register.cc b/src/register/register.cc
index 9e8f6eaaf..930367a97 100644
--- a/src/register/register.cc
+++ b/src/register/register.cc
@@ -92,8 +92,8 @@ static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
 }
 }
 if (reg->state & NVLS_REG_COMPLETE) {
- if (ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) {
- WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize);
+ if (ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize) != ncclSuccess) {
+ WARN("rank %d deregister NVLS buffer %p dev %d ucsize %ld mcsize %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize);
 }
 reg->regAddr = (CUdeviceptr)NULL;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 5629ce7a2..f98b77a43 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -11,11 +11,12 @@
 #include "timer.h"
 #include "transport.h"
-struct ncclTransport* ncclTransports[NTRANSPORTS] = {
+struct ncclTransport* ncclTransports[NTRANSPORTS+1] = {
 &p2pTransport,
 &shmTransport,
 &netTransport,
- &collNetTransport
+ &collNetTransport,
+ &profilerTransport // Not really used for transport, only to create proxy ops polling on profiler counters.
 };
 template <int type>
@@ -111,12 +112,14 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 gettimeofday(&timeStart, NULL);
 timeLast = timeStart; // struct copy
 bool timeReported = false;
+ cudaStream_t hostStream, deviceStream;
 NCCLCHECK(ncclCalloc(&data, maxPeers));
 NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
 NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
- NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+ NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
+ NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
 // First time initialization
 for (int i=1; i<comm->nRanks; i++) {
 int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
@@ -195,7 +198,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 if (ret == ncclSuccess) {
 conn->connected = 1;
 /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
- CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+ CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail);
 } else if (ret == ncclInProgress) {
 allChannelsConnected = false;
 }
@@ -214,7 +217,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 if (ret == ncclSuccess) {
 conn->connected = 1;
 /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access.
*/ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -286,8 +289,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (sendData) free(sendData); if (recvData) free(recvData); - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); return ret; fail: goto exit; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 67180123f..c1ccfcaa8 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -103,7 +103,7 @@ struct sendResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; @@ -124,7 +124,7 @@ struct recvResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; @@ -143,9 +143,19 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG return ncclSuccess; } +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + struct setupReq { int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; @@ -168,8 +178,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -192,8 +202,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? 
"/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -454,6 +464,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); @@ -505,16 +516,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -525,10 +537,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -574,16 +594,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -600,7 +621,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { @@ -737,7 +765,7 @@ static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -779,7 +807,7 @@ static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, stru } static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); recvParts.mhandle = recvMhandle; @@ -796,7 +824,7 @@ static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = 
args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -835,7 +863,7 @@ static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, } static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); sendParts.mhandle = sendMhandle; @@ -1150,6 +1178,7 @@ struct collnetRegInfo { static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; + int gdrEnable = -1; if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration @@ -1165,6 +1194,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + gdrEnable = 1; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; @@ -1174,7 +1204,8 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } } else { - WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); + gdrEnable = 0; + goto fail; } } } @@ -1183,6 +1214,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use fail: *outRegBufFlag = 0; *outHandle = NULL; + INFO(NCCL_REG, "rank %d - COLLNET failed to register userbuff %p, buffSize %ld, type %s, GDR %d", comm->rank, userbuff, buffSize, type == collNetRecv ? 
"Recv" : "Send", gdrEnable); goto exit; } @@ -1268,17 +1300,20 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1301,17 +1336,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1600,4 +1638,4 @@ struct ncclTransport collNetTransport = { canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; \ No newline at end of file +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 8760b4258..40d334fa7 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -92,7 +92,7 @@ struct sendNetResources { int tpLocalRank; int tpRemoteRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; @@ -123,7 +123,7 @@ struct recvNetResources { int tpRemoteRank; int tpRemoteProxyRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; int maxRecvs; @@ -168,7 +168,7 @@ struct setupReq { int tpRemoteRank; int shared; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; int channelId; int connIndex; @@ -180,6 +180,16 @@ static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large // 
Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { @@ -204,11 +214,14 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + proxyRank, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); @@ -247,18 +260,19 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? 
"/Shared" : ""); return ncclSuccess; } -static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { - NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, proxyConn->rank, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); + NCCLCHECK(ncclShmAllocateShareableBuffer(mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } @@ -292,6 +306,7 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; + int trafficClass; }; struct netRecvConnectArgs { @@ -315,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -343,7 +359,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, &send->proxyConn, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, @@ -692,9 +708,11 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass;
 NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle));
 if (resources->shared) {
 // Shared buffers
@@ -714,15 +732,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
 }
 struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
- if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
+ if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
 resources->netSendComm = comms->sendComm[resources->channelId];
 if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
 } else {
- ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+ ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
 }
 } else {
 // Connect to remote peer
- ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+ ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
 connection->proxyAppendPtr = &connection->proxyAppend;
 }
@@ -748,7 +766,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]);
 resources->buffSizes[p] = proxyState->buffSizes[p];
 }
 } else {
@@ -765,7 +783,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
 }
- NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 }
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
@@ -820,7 +838,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
 if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
 int dmabuf_fd;
- CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+ CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
 NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
 (void)close(dmabuf_fd);
 } else // FALL-THROUGH to nv_peermem GDR path
@@ -904,7 +922,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]);
 resources->buffSizes[p] = proxyState->buffSizes[p];
 }
 } else {
@@ -915,14 +933,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
 resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
- NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 }
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
 if (proxyState->allocP2pNetLLBuffers) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*devMem*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
 resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
 }
@@ -964,7 +982,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
 if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
 int dmabuf_fd;
- CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+ CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
 NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
 (void)close(dmabuf_fd);
 } else // FALL-THROUGH to nv_peermem GDR path
@@ -1175,11 +1193,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
 // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
 // since size is a plain integer.
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; sub->transmitted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; @@ -1280,6 +1299,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; + void* phandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; int postedStepId = sub->posted; @@ -1323,6 +1343,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; + phandles[subCount] = sub; subCount++; } } @@ -1332,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1341,6 +1362,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } @@ -1558,7 +1580,7 @@ static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size return ret; fail: *outRegBufFlag = 0; - WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + INFO(NCCL_REG, "rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); goto exit; } @@ -1639,7 +1661,7 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; 
- CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; @@ -1673,7 +1695,7 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bc54133d3..bfff6e555 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -11,6 +11,7 @@ #include "graph.h" #include "utils.h" #include "param.h" +#include "profiler/net_ib.h" #include #include @@ -85,6 +86,11 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; +#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) + +#define NCCL_IB_SL_DEFAULT 0 +#define NCCL_IB_TC_DEFAULT 0 + NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); @@ -92,8 +98,8 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); -NCCL_PARAM(IbSl, "IB_SL", 0); -NCCL_PARAM(IbTc, "IB_TC", 0); +NCCL_PARAM(IbSl, "IB_SL", -1); +NCCL_PARAM(IbTc, "IB_TC", -1); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); @@ -327,6 +333,9 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, close(fd); if (ret == -1) { + // In containerized environments, read could return EINVAL if the GID index is not mapped to the + // container sysfs. In this case return ncclSuccess and let the caller move to next GID index. 
+ if (errno == EINVAL) return ncclSuccess; WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -359,7 +368,7 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } int usrRoceVer = roceVer; - int gidRoceVerNum, gidRoceVerNumCandidate; + int gidRoceVerNum, gidRoceVerNumCandidate = -1; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); @@ -530,8 +539,8 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclIbDev* dev = ncclIbDevs + props->devs[i]; if (dev->link != dev0->link) { - WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA", - dev0->devName, dev0->link, dev->devName, dev->link); + WARN("NET/IB : Attempted to merge incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } } @@ -548,8 +557,11 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return res; } -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; + ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -571,7 +583,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { struct ibv_device** devices; // Check if user defined which IB device:port to use - char* userIbEnv = getenv("NCCL_IB_HCA"); + const char* userIbEnv = ncclGetEnv("NCCL_IB_HCA"); if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; @@ -634,7 +646,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -666,7 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d = 0; d < ncclNIbDevs; d++) { snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[d].portNum, NCCL_IB_LLSTR(ncclIbDevs[d].link)); } char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", @@ -832,6 +844,8 @@ struct ncclIbConnectionMetadata { char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; + int tc; + int sl; }; enum ncclIbCommState { @@ -873,12 +887,23 @@ struct ncclIbGidInfo { #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; +#define MAX_QPS_PER_REQ 8 +struct ncclProfilerInfo { + void* qpEventHandles[MAX_QPS_PER_REQ]; + int qpIndex[MAX_QPS_PER_REQ]; + int nEventHandles; + ncclProfilerNetIbDescr_v1_t data; +}; + struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; +#ifdef NCCL_ENABLE_NET_PROFILING + struct ncclProfilerInfo pInfo[NCCL_NET_IB_MAX_RECVS]; +#endif int nreqs; union { struct { @@ -1084,7 +1109,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc, int tc, int sl) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -1100,7 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? 
ncclParamIbFifoTc() : tc; } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1122,10 +1147,10 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.hop_limit = 255; } } - qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.sl = sl; qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; - TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u sl: %d tc: %d", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port, qpAttr.ah_attr.sl, qpAttr.ah_attr.grh.traffic_class); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1164,12 +1189,13 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { goto exit; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; + uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; @@ -1199,7 +1225,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // IB Setup struct ncclIbMergedDev* mergedDev; if (dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", dev); return ncclInternalError; } @@ -1305,8 +1331,17 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; + if (link_layer != devInfo->link_layer) { + int ibDev0 = comm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + commDev->base.ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } meta.fifoAddr = (uint64_t)comm->fifo; + meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; + meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? 
config->trafficClass : NCCL_IB_TC_DEFAULT; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; @@ -1332,13 +1367,16 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->base.nRemDevs = remMeta.ndevs; - int link_layer; - link_layer = remMeta.devs[0].link_layer; - for (int i = 1; i < remMeta.ndevs; i++) { - if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't connect net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", - i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); - return ncclInternalError; + // ensure that the remote devices have the same link layer than the local devices used in the connection. + if (comm->base.vProps.ndevs > 0) { + int ibDev0 = comm->devs[0].base.ibDevN; + link_layer = ncclIbDevs[ibDev0].portAttr.link_layer; + for (int i = 0; i < remMeta.ndevs; i++) { + if (remMeta.devs[i].link_layer != link_layer) { + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } } @@ -1373,7 +1411,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); - NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } @@ -1459,6 +1497,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; int ready; + int link_layer = IBV_LINK_LAYER_UNSPECIFIED; *recvComm = NULL; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; @@ -1497,7 +1536,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ncclNetVDeviceProps_t remoteVProps; memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); if (lComm->dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", lComm->dev); return ncclInternalError; } @@ -1555,6 +1594,13 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail); + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = ibDev->portAttr.link_layer; + if (link_layer != ibDev->portAttr.link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. 
Try selecting NICs of only one link type using NCCL_IB_HCA", + ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. @@ -1562,6 +1608,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle rComm->base.remDevs[i] = remMeta.devs[i]; rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; + if (remMeta.devs[i].link_layer != link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Stripe QP creation across merged devs @@ -1598,7 +1650,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.qpInfo[q].ece_supported = 0; } - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } @@ -1629,7 +1681,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } @@ -1646,6 +1698,8 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; + meta.sl = remMeta.sl; + meta.tc = remMeta.tc; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; @@ -1842,7 +1896,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1860,6 +1914,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); +#ifdef NCCL_ENABLE_NET_PROFILING + reqs[r]->pInfo[0].nEventHandles = 0; +#endif } // Write size as immediate data. 
In the case of multi-send, only write @@ -1929,6 +1986,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } struct ibv_send_wr* bad_wr; +#ifdef NCCL_ENABLE_NET_PROFILING + // QP profiling loop + for (int r=0; rpInfo[0].nEventHandles; + reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + reqs[r]->pInfo[0].data.type = ncclProfileQp; + reqs[r]->pInfo[0].data.qp.device = devIndex; + reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; + reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; + reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; + reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + reqs[r]->pInfo[0].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rbase.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2018,7 +2093,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot)); + NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2109,7 +2184,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2121,6 +2196,9 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int r = 0; r < n && phandles; r++) req->pInfo[r].nEventHandles = 0; +#endif for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; @@ -2141,6 +2219,19 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); +#ifdef NCCL_ENABLE_NET_PROFILING + // Start a QP event for every request in the multirecv and every qp + for (int r = 0; r < n && phandles; r++) { + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + req->pInfo[r].data.type = ncclProfileQp; + req->pInfo[r].data.qp.device = qp->devIndex; + req->pInfo[r].data.qp.wr_id = wr.wr_id; + req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + req->pInfo[r].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; 
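For illustration only: the per-QP profiler events added in ncclIbMultiSend, ncclIbIrecv and ncclIbTest above follow a start/stop bracket around each posted work request. The typedef and helpers below (demoProfilerCb, demoStartQpEvent, demoStopQpEvent) are hypothetical stand-ins that only mirror the calling convention visible here: type 0 starts an event and passes a descriptor, type 1 stops it; the real signature is defined by nccl_profiler.h.

// Hypothetical stand-ins; the real callback type comes from nccl_profiler.h.
typedef int (*demoProfilerCb)(void** eHandle, int type, void* pHandle, int pluginId, void* descr);

struct DemoQpDescr { int device; unsigned qpNum; };

// Post time (ncclIbMultiSend/ncclIbIrecv): start one event per QP the request is striped over.
static int demoStartQpEvent(demoProfilerCb cb, void** eHandle, void* pHandle, int pluginId, DemoQpDescr* d) {
  return cb(eHandle, 0 /*start*/, pHandle, pluginId, d);
}

// Completion time (ncclIbTest polling the CQ): stop the matching event through the same callback.
static int demoStopQpEvent(demoProfilerCb cb, void** eHandle) {
  return cb(eHandle, 1 /*stop*/, nullptr, 0, nullptr);
}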
} @@ -2196,6 +2287,16 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name) +#ifdef NCCL_ENABLE_NET_PROFILING +static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) { + for (int i = 0; i < MAX_QPS_PER_REQ; i++) { + int qpIndex = req->pInfo[request].qpIndex[i]; + if (req->base->qps[qpIndex].qp->qp_num == qpNumber) return i; + } + return 0; +} +#endif + ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; @@ -2205,11 +2306,24 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { - for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; + for (int i=0; inreqs; i++) { + sizes[i] = r->recv.sizes[i]; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[i].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif + } } if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { sizes[0] = r->send.size; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[0].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif } + // Stop all remaining Qp events for this event NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } @@ -2264,6 +2378,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { return ncclInternalError; } sendReq->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for sendReq + NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL)); +#endif } } else { if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { @@ -2276,6 +2394,12 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { } } req->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for workFifo + for (int j = 0; j < req->nreqs; j++) { + NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL)); + } +#endif } } // Once the IB fatal event is reported in the async thread, we want to propagate this error diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 235dee865..8034d95fe 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -9,6 +9,7 @@ #include "socket.h" #include "net.h" #include "param.h" +#include "profiler/net_socket.h" #include #include @@ -35,7 +36,10 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { return ncclSuccess; } -ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + ncclProfilerFunction = profFunction; if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclNetSocketLock); if (ncclNetIfs == -1) { @@ -158,6 +162,11 @@ struct ncclNetSocketTask { ncclResult_t result; }; +struct ncclProfilerInfo { + void* eHandle; + void* pHandle; +}; + struct ncclNetSocketRequest { int op; void* data; @@ -168,6 +177,7 @@ struct ncclNetSocketRequest { struct ncclNetSocketComm* comm; struct ncclNetSocketTask* tasks[MAX_SOCKETS]; int nSubs; + struct ncclProfilerInfo pInfo; }; struct ncclNetSocketTaskQueue { @@ -180,6 +190,7 @@ struct ncclNetSocketThreadResources { struct 
ncclNetSocketTaskQueue threadTaskQueue; int stop; struct ncclNetSocketComm* comm; + struct ncclProfilerInfo* pInfo; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; @@ -210,6 +221,9 @@ void* persistentSocketThread(void *args_) { struct ncclNetSocketComm* comm = resource->comm; struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; +#ifdef NCCL_ENABLE_NET_PROFILING + void* eHandle[MAX_REQUESTS*MAX_SOCKETS] = { 0 }; +#endif while (1) { int idle = 1; int mark = myQueue->next; // mark newest task seen @@ -220,13 +234,33 @@ void* persistentSocketThread(void *args_) { for (int j=0; jtasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { +#ifdef NCCL_ENABLE_NET_PROFILING + if (!eHandle[i+j]) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->sock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; +#endif WARN("NET/Socket : socket progress error"); return NULL; } idle = 0; if (r->offset < r->size) repeat = 1; +#ifdef NCCL_ENABLE_NET_PROFILING + if (repeat == 0) { + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; + } +#endif } } } while (repeat); @@ -326,7 +360,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) goto exit; } -ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -444,7 +478,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi return ncclInternalError; } -ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) { +ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclProfilerInfo* pInfo, int op, void* data, int size, struct ncclNetSocketTask** req) { int tid = comm->nextSock % comm->nThreads; struct ncclNetSocketThreadResources* res = comm->threadResources+tid; struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue; @@ -457,6 +491,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* NCCLCHECK(ncclCalloc(&queue->tasks, queue->len)); queue->next = 0; res->comm = comm; +#ifdef NCCL_ENABLE_NET_PROFILING + res->pInfo = pInfo; +#endif pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); @@ -520,7 +557,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); while (chunkOffset < r->size) { int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, 
(char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); chunkOffset += chunkSize; } } @@ -544,6 +581,16 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { } } } else { // progress request using main thread +#ifdef NCCL_ENABLE_NET_PROFILING + if (!r->pInfo.eHandle) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->ctrlSock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif if (r->offset < r->size) { NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } @@ -551,6 +598,10 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { if (size) *size = r->size; *done = 1; r->used = 0; +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL); + r->pInfo.eHandle = NULL; +#endif } } } @@ -562,16 +613,26 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { +ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + req->pInfo.pHandle = phandle; +#endif return ncclSuccess; } -ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + if (phandles) req->pInfo.pHandle = phandles[0]; +#endif return ncclSuccess; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 3fe25a324..d99f7cb3e 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -108,29 +108,29 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { - CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { + CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); + CUCHECK(cuMemUnmap(ptr, mcsize)); + CUCHECK(cuMemAddressFree(ptr, mcsize)); CUCHECK(cuMemRelease(*mcHandler)); - INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); + INFO(NCCL_NVLS, 
"rank %d - NVLS deregistered buffer %p on device %d ucsize %ld mcsize %ld", comm->rank, (void*)ptr, dev, ucsize, mcsize); return ncclSuccess; } -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { - INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr, CUmemGenericAllocationHandle* ucHandle, size_t mcsize, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { + INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) ucsize %zu MC handle 0x%llx(%p) mcsize %zd", *ucHandle, ucptr, ucsize, *mcHandle, mcptr, mcsize); // Release the UC memory and mapping if (ucptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, ucsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, ucsize)); CUCHECK(cuMemRelease(*ucHandle)); } // Release the MC memory and mapping if (mcptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, mcsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, mcsize)); CUCHECK(cuMemRelease(*mcHandle)); } @@ -197,25 +197,27 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { goto exit; } -static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { +static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc* desc, size_t size, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr, size_t* ucsizePtr, size_t* mcsizePtr) { char shareableHandle[NVLS_HANDLE_SIZE]; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; ncclResult_t ret = ncclSuccess; - size_t size = *sizePtr; - size_t originSize = size; + size_t mcsize; + size_t ucsize; size_t ucgran, mcgran; int allocMcHandle = 0; + mcsize = ucsize = size; *ucptr = *mcptr = NULL; + memset(shareableHandle, '\0', sizeof(shareableHandle)); memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; mcprop.size = size; - CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); - ALIGN_SIZE(size, mcgran); - *sizePtr = mcprop.size = size; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); @@ -235,26 +237,29 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - // Map a VA for UC memory - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); + ALIGN_SIZE(ucsize, ucgran); + // Map a VA for UC memory with MC alignment and size + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), 
ret, fail); // Alloc local physical mem for this NVLS group - CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); - CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail); // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group - CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); + CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail); // Map mc virtual address - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail); - INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, mcsize, 0, *mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, mcsize, desc, 1), ret, fail); + *ucsizePtr = ucsize; + *mcsizePtr = mcsize; + INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld ucsize %ld mcsize %ld (inputsize %ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, ucsize, mcsize, size); exit: return ret; @@ -273,6 +278,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { size_t nvlsTotalSize = 0; struct ncclNvlsSharedRes* resources = NULL; int nChannels = -1; + cudaStream_t deviceStream, hostStream; if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; // initialize after checking comm->nvlsSupport @@ -288,10 +294,10 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); - resources->buffSize = nvlsTotalSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, 
&hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -306,15 +312,16 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail); comm->nvlsResources->inited = true; @@ -374,6 +381,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { size_t memSize = 64; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; + cudaStream_t hostStream, deviceStream; NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); comm->nvlsResources->inited = false; @@ -398,11 +406,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { 
resources->accessDesc.location.id = comm->cudaDev; resources->dev = comm->cudaDev; - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail); - resources->creditSize = creditSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit, &resources->creditUCSize, &resources->creditMCSize), res, fail); // Set up head and tail only for now - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -440,14 +448,15 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); } // MNNVL does not 
support NVLS buffer registration @@ -488,13 +497,13 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); if (resources->ucCredit || resources->mcCredit) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->creditUCSize, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditUCSize, resources->ucCredit, &resources->ucCreditHandle, resources->creditMCSize, resources->mcCredit, &resources->mcCreditHandle)); } if (comm->nvlsResources->inited) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->buffUCSize, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffUCSize, resources->ucBuff, &resources->ucBuffHandle, resources->buffMCSize, resources->mcBuff, &resources->mcBuffHandle)); } free(resources); comm->nvlsResources = NULL; @@ -513,7 +522,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t size_t minSize = SIZE_MAX; struct localRegData* regData = NULL; cudaPointerAttributes attr; - size_t ucgran, mcgran; + size_t ucgran, mcgran, ucsize, mcsize; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); @@ -538,13 +547,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr), ret, fail); - if (regSize % mcgran == 0) { - regRecord->regSize = regSize; - } else { - regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); - } - - if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { + if (regRecord->addr % ucgran == 0) { + if (regSize % ucgran != 0) { + regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran); + } else { + regRecord->regUCSize = regSize; + } regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; @@ -564,13 +572,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t goto fail; } /* get minimal reg size of nvls buffers */ - if (minSize > regData[i].reg.regSize) - minSize = regData[i].reg.regSize; + if (minSize > regData[i].reg.regUCSize) + minSize = regData[i].reg.regUCSize; } /* start registration */ + mcsize = ucsize = minSize; mcprop.size = minSize; CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; + if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); @@ -583,16 +595,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t // Coverity complains 
that regRecord could be NULL. That won't in practice be the case because we've already checked // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out. // coverity[var_deref_op] - CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail); // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemAddressReserve(®Ptr, mcsize, mcgran, 0U, 0), ret, fail); // Map the VA locally - CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail); regRecord->regAddr = regPtr; - regRecord->regSize = minSize; + regRecord->regUCSize = ucsize; + regRecord->regMCSize = mcsize; regRecord->dev = comm->nvlsResources->dev; regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; @@ -706,7 +719,7 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu return ncclSuccess; fail: regBufUsed = 0; - WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); + INFO(NCCL_REG, "rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); goto exit; } @@ -843,7 +856,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index dac762157..aed84c588 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -407,6 +407,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } + memset(&req, '\0', sizeof(req)); req.size = sendSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -466,6 +467,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = intermediateRank; } + memset(&req, '\0', sizeof(req)); req.size = recvSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -527,7 +529,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn if (useMemcpy) { // Attach to peer's SHM segment - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; @@ -634,7 +636,7 @@ static ncclResult_t 
p2pSendProxySetup(struct ncclProxyConnection* connection, st // Create a SHM segment for the peer to attach to shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); + NCCLCHECK(ncclShmAllocateShareableBuffer(shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); @@ -805,7 +807,7 @@ static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size ncclResult_t ret = ncclSuccess; struct ncclIpcRegInfo* newInfo = NULL; uintptr_t* peerRmtAddrs = NULL; - bool legacyIpcCap = false; + int legacyIpcCap = 0; size_t baseSize = 0; void* baseAddr = NULL; bool needUpdate = false; @@ -916,13 +918,16 @@ ncclResult_t ret = ncclSuccess; if (type == NCCL_IPC_COLLECTIVE) { // for collective, store registered remote buffers into dev memory for future reference if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + cudaStream_t hostStream, deviceStream; + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) - NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); if (needUpdate) - NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail); } peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; } else { @@ -941,7 +946,7 @@ ncclResult_t ret = ncclSuccess; *offsetOut = 0; *peerRmtAddrsOut = NULL; if (newInfo) free(newInfo); - WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc); + INFO(NCCL_REG, "rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %d type %s", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc ? *isLegacyIpc : -1, ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR ? 
"POSIX_FD" : "FABRIC"); goto exit; } diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc new file mode 100644 index 000000000..3e32843aa --- /dev/null +++ b/src/transport/profiler.cc @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "transport.h" +#include "proxy.h" +#include "profiler.h" + +static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + connection->proxyAppendPtr = &connection->proxyAppend; + connection->shared = 1; + return ncclSuccess; +} + +// The following ncclProxySubArgs are overloaded by the profiler progress function: +// - base : is set to the current value of workCounter[channelId] +// - posted : is set to sub->nsteps to indicate that the profiler has started the event +// - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event +static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + sub->base = sub->workCounter; + sub->posted = sub->transmitted = 0; + } + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + uint64_t* workStarted = (uint64_t *)sub->sendbuff; + uint64_t* workCompleted = (uint64_t *)sub->recvbuff; + if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) { + ncclProfilerStartKernelChEvent(args, s); + sub->posted = sub->nsteps; + continue; // allow events on every channel to start + } + if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) { + ncclProfilerStopKernelChEvent(args, s); + sub->transmitted = sub->nsteps; + args->done++; + } + } + if (args->done == args->nsubs) args->state = ncclProxyOpNone; + } + return ncclSuccess; +} + +struct ncclTransport profilerTransport = { + "Prof", + NULL, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL } +}; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index d2d6906e8..aa3e6c41b 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -18,6 +18,7 @@ struct shmBuffInfo { }; struct shmConnectInfo { + int rank; ncclShmIpcDesc_t desc; struct shmBuffInfo buf; }; @@ -120,6 +121,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclSendMem*)info->buf.hptr; resources->devHostMem = (struct ncclSendMem*)info->buf.dptr; @@ -150,6 +152,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct 
shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclRecvMem*)info->buf.hptr; resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr; @@ -163,7 +166,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; char* buff; - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; pdesc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; ptpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -485,7 +488,7 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -517,9 +520,9 @@ static void initCeOperation() { } } -ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { - if (desc == NULL || hptr == NULL || tpProxyRank < -1) { - WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank); +ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { + if (desc == NULL || hptr == NULL) { + WARN("Invalid argument desc %p, hptr %p", desc, hptr); return ncclInvalidArgument; } #if CUDART_VERSION >= 12020 @@ -532,7 +535,6 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&desc->shmci.data, &handle, sizeof(handle)); - desc->shmci.tpProxyRank = tpProxyRank; } else { CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0)); } @@ -560,7 +562,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l return ncclSuccess; } -ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, 
ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) { WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut); return ncclInvalidArgument; @@ -584,7 +586,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ // UDS fd support int fd = -1; // Send cuMem handle to remote for conversion to an fd - NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd)); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, proxyRank, &desc->shmci.data, &fd)); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); (void) close(fd); } else { @@ -625,7 +627,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ descOut->shmci.ptr = *hptr = (void *)hostptr; descOut->legacy = false; if (dptr) *dptr = (void *)hostptr; - INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); + INFO(NCCL_SHM, "CUMEM imported shareable host buffer from proxyRank %d size %zi ptr %p, granularity %ld", proxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); } else { char shmPath[SHM_PATH_MAX]; snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); From 145e67e70745c5f78f18334f82de29dbe59bde63 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Wed, 9 Apr 2025 09:02:40 -0700 Subject: [PATCH 07/21] Update ext-profiler example Sync ext-profiler example with 2.26.2. --- ext-profiler/README.md | 142 +++++++++++++++++++--- ext-profiler/example/Makefile | 2 +- ext-profiler/example/event.h | 41 ++++++- ext-profiler/example/nccl/net_ib_v1.h | 34 ++++++ ext-profiler/example/nccl/net_socket_v1.h | 32 +++++ ext-profiler/example/nccl/profiler.h | 51 +++++++- ext-profiler/example/nccl/profiler_net.h | 22 ++++ ext-profiler/example/nccl/profiler_v1.h | 16 ++- ext-profiler/example/nccl/profiler_v2.h | 44 +------ ext-profiler/example/nccl/profiler_v3.h | 119 ++++++++++++++++++ ext-profiler/example/plugin.c | 105 +++++++++++++++- ext-profiler/example/plugin.h | 13 ++ ext-profiler/example/print_event.c | 78 +++++++++++- 13 files changed, 621 insertions(+), 78 deletions(-) create mode 100644 ext-profiler/example/nccl/net_ib_v1.h create mode 100644 ext-profiler/example/nccl/net_socket_v1.h create mode 100644 ext-profiler/example/nccl/profiler_net.h create mode 100644 ext-profiler/example/nccl/profiler_v3.h create mode 100644 ext-profiler/example/plugin.h diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 7ef44b2fa..2a4018c07 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v2) +# API (v3) -Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections. 
``` typedef struct { @@ -70,7 +70,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -82,13 +82,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; +} ncclProfiler_v3_t; ``` ## Error codes @@ -156,7 +156,6 @@ typedef struct { size_t count; // data count int root; // root rank const char* datatype; // string containing the name of the datatype - size_t trafficBytes; // number of transfer bytes uint8_t nMaxChannels; // max number of channels for this collective uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective @@ -185,12 +184,22 @@ typedef struct { struct { // proxyStep events metadata int step; // individual step in `ncclProxyOp` } proxyStep; + + struct { + uint8_t channelId; // id of the channel used by the kernel + } kernelCh; + + struct { + int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions) + void* data; // pointer to network plugin defined event + } netPlugin; }; -} ncclProfilerEventDescr_v2_t; +} ncclProfilerEventDescr_v3_t; ``` NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, -`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`. +`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and +`ncclProfileNetPlugin`. #### stopEvent @@ -236,7 +245,7 @@ typedef enum { ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end -} ncclProfilerEventState_v2_t; +} ncclProfilerEventState_v3_t; ``` `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing @@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events. network requests for the GPU kernel. This includes everything else that the proxy thread might be doing, including appending new `ncclProxyOp` objects to the list of work elements to process. +`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel +processes work items for the enqueued NCCL operations. + +`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define +their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and +the `ncclProfilerCallback\_t` NCCL core callback. 
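For orientation, below is a minimal sketch of how a profiler's `startEvent` might dispatch on the v3 descriptor. This is not the bundled example plugin; it assumes the v3 headers shown above are on the include path, keeps no per-event state, and `sketchStartEvent` is just an illustrative name.

```C
#include <stdio.h>
#include "profiler.h"   // v3 types shown above (include path is an assumption)

static ncclResult_t sketchStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr) {
  (void)context;
  *eHandle = NULL;                             // this sketch keeps no per-event state
  switch (eDescr->type) {
    case ncclProfileColl:                      // host collective call
      printf("coll: count=%zu datatype=%s algo=%s\n",
             eDescr->coll.count, eDescr->coll.datatype, eDescr->coll.algo);
      break;
    case ncclProfileKernelCh:                  // per-channel kernel activity (new event type)
      printf("kernelCh: channel %u\n", (unsigned)eDescr->kernelCh.channelId);
      break;
    default:                                   // group/p2p/proxy/netPlugin events ignored here
      break;
  }
  return ncclSuccess;
}
```

A real plugin would instead allocate a handle per event and return it through `eHandle`, so that `stopEvent` and `recordEventState` can update it later.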
The network and profiler plugin can agree on the +network defined event definition using the plugin id in the event descriptor. The plugin identifier +is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next +16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are +unused and available for future extensions. + +A network IB plugin can use this infrastructure to define a QP event as: + +```C +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; +``` + +The network event infrastructure is network agnostic. A different network socket plugin can +use it to define a socket event as: + +```C +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; +``` + +The network plugin creates an event (descriptor) and passes it to the profiler callback, +along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin` +event descriptor, attaches the network plugin defined event as external data, and calls +the profiler `startEvent` function. + +```C +ncclResult_t isend(..., void* phandle, ...) { + ... + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + ncclProfilerNetIbDescr_v1_t eDescr = { }; + eDescr.type = ncclProfileQp; + eDescr.qp = { ... }; + ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr); + ... +} +``` + State transitions for the events described can also come with event attribute updates. For this reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below. @@ -264,7 +356,7 @@ typedef union { struct { // attributes to update for ncclProfileProxyCtrl int appendedProxyOps; // number of appended proxy ops thus far } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; +} ncclProfilerEventStateArgs_v3_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. 
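On the receiving side, the profiler can recover the originating network plugin and the event struct version from `netPlugin.id` before interpreting `netPlugin.data`. The sketch below is illustrative only, assuming the v3 and network event headers above; `sketchHandleNetPlugin` is a hypothetical helper a profiler would call from its `startEvent`.

```C
#include <stdio.h>
#include "profiler.h"   // also pulls in profiler_net.h, net_ib_v1.h, net_socket_v1.h (assumed include path)

static void sketchHandleNetPlugin(ncclProfilerEventDescr_v3_t* eDescr) {
  if (eDescr->type != ncclProfileNetPlugin) return;
  int64_t type = eDescr->netPlugin.id & NCCL_PROFILER_NET_TYPE_MASK;  // which plugin produced the event
  int64_t ver  = eDescr->netPlugin.id & NCCL_PROFILER_NET_VER_MASK;   // version of the attached struct
  if (type == NCCL_PROFILER_NET_TYPE_IB && ver == 1) {
    ncclProfilerNetIbDescr_v1_t* ib = (ncclProfilerNetIbDescr_v1_t*)eDescr->netPlugin.data;
    if (ib->type == ncclProfileQp)
      printf("IB QP %d: wr_id=%llu length=%zu\n", ib->qp.qpNum, (unsigned long long)ib->qp.wr_id, ib->qp.length);
  } else if (type == NCCL_PROFILER_NET_TYPE_SOCK && ver == 1) {
    ncclProfilerNetSockDescr_v1_t* sk = (ncclProfilerNetSockDescr_v1_t*)eDescr->netPlugin.data;
    if (sk->type == ncclProfileSocket)
      printf("socket fd %d: op=%d length=%zu\n", sk->sock.fd, sk->sock.op, sk->sock.length);
  }
}
```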
@@ -279,14 +371,22 @@ Group event +- Collective event | | | +- ProxyOp event - | | - | +- ProxyStep event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event | +- Point-to-point event | +- ProxyOp event - | - +- ProxyStep event + | | + | +- ProxyStep event + | | + | +- NetPlugin event + | + +- KernelCh event ProxyCtrl event ``` @@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p dereference a pointer from another address space the event descriptor includes the PID of the originator. The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the parent event. + +# Known Limitations + +In intra-node communication, or whenever a rank does not have any network activity for which proxy events +are unavailable, the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from +enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the +collective. However, this time only represents the launch time of the collective and not the actual +execution time. To reconstruct the execution time more accurately proxy and kernel events are provided. + +Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress +thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If +the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of +accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is +delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events. diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index ee8e0cf08..f5cc9f1d8 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c event.c print_event.c - $(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 1486a2248..0638f2df1 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -33,10 +33,42 @@ #define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) #define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) - -#define MAX_COMM_CLIQUES (32 * 8) +#define MAX_EVENTS_PER_REQ (8) struct proxyOp; +struct proxyStep; + +struct netPlugin { + uint8_t type; + int pluginType; + int pluginVer; + uint8_t pluginEvent; + union { + struct { + int device; + int qpNum; + int opcode; + uint64_t wr_id; + size_t length; + } qp; + struct { + int fd; + int op; + size_t length; + } sock; + }; + double startTs; + double stopTs; + struct proxyStep* parent; +}; + +struct kernelCh { + uint8_t type; + uint8_t channelId; + struct taskEventBase* parent; + double startTs; + double stopTs; +}; struct proxyStep { uint8_t type; // type of event: network transfer @@ -46,6 +78,8 @@ struct proxyStep { double startTs; double stopTs; struct proxyOp* parent; + struct netPlugin net[MAX_EVENTS_PER_REQ]; + int nNetEvents; }; struct proxyOp { @@ -101,7 +135,6 @@ struct collective { void const* sendBuff; void* recvBuff; size_t count; - size_t trafficBytes; int root; const char* datatype; uint8_t nMaxChannels; @@ -111,6 +144,7 @@ struct collective { struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events int nProxyOps[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct p2p { @@ -121,6 +155,7 @@ struct p2p { const char* datatype; int peer; struct proxyOp op[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct group { diff --git a/ext-profiler/example/nccl/net_ib_v1.h b/ext-profiler/example/nccl/net_ib_v1.h new file mode 100644 index 000000000..f142de5f5 --- /dev/null +++ b/ext-profiler/example/nccl/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/net_socket_v1.h b/ext-profiler/example/nccl/net_socket_v1.h new file mode 100644 index 000000000..0cb664f20 --- /dev/null +++ b/ext-profiler/example/nccl/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. 
NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index 6680cfece..d02202d51 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ +#ifndef PROFILER_H_ +#define PROFILER_H_ #include #include @@ -13,7 +13,54 @@ #include "common.h" #include "err.h" +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" +#include "profiler_net.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_net.h b/ext-profiler/example/nccl/profiler_net.h new file mode 100644 index 000000000..2d087ca54 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_net.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_NET_H_ +#define PROFILER_NET_H_ + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#include "net_ib_v1.h" +#include "net_socket_v1.h" + +#endif diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h index 7d34bed57..e7d316d48 100644 --- a/ext-profiler/example/nccl/profiler_v1.h +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V1_H_ -#define NCCL_PROFILER_V1_H_ +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ #include @@ -59,8 +59,16 @@ typedef struct { }; } ncclProfilerEventDescr_v1_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h index aab4ccf86..4be600d52 100644 --- a/ext-profiler/example/nccl/profiler_v2.h +++ b/ext-profiler/example/nccl/profiler_v2.h @@ -4,20 +4,11 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V2_H_ -#define NCCL_PROFILER_V2_H_ +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ #include -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - typedef struct { uint8_t type; // event type descriptor: ncclProfileColl, ... 
void* parentObj; // pointer to the profiler parent object (for coll is the group) @@ -65,32 +56,6 @@ typedef struct { }; } ncclProfilerEventDescr_v2_t; -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - typedef union { struct { size_t transSize; @@ -138,9 +103,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v2_t; -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v3.h b/ext-profiler/example/nccl/profiler_v3.h new file mode 100644 index 000000000..c1f1b919f --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v3.h @@ -0,0 +1,119 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v3_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index 64d5d8be1..08408dba7 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -58,6 +58,7 @@ __hidden double gettime(void) { static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; +static int* eActivationMaskPtr; __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { pthread_mutex_lock(&lock); @@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) // first thread 
initializes event mask, environment and detach pool const char* str; str = getenv("NCCL_PROFILE_EVENT_MASK"); - __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED); + __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED); str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; @@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) } pthread_mutex_unlock(&lock); + // store pointer to activation mask globally + eActivationMaskPtr = eActivationMask; + // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); @@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps - memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); - memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); @@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->trafficBytes = eDescr->coll.trafficBytes; event->nMaxChannels = eDescr->coll.nMaxChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; @@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } - } else if (eDescr->type == ncclProfileProxyStep) { + } else if (eDescr->type == ncclProfileProxyStep) { // the parent might be null if we run out of events struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj; if (parent == NULL) return ncclSuccess; @@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = parent->isSend; event->parent = parent; event->startTs = gettime() - startTime; + event->nNetEvents = 0; *eHandle = event; debugEvent(event, "ProxyStepStart"); + } else if (eDescr->type == ncclProfileKernelCh) { + struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj; + if (eventBase == NULL) return ncclSuccess; + if (eventBase->type == ncclProfileColl) { + struct collective* parent = (struct collective *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "KernelChStart"); + } else { // ncclProfileP2p + struct p2p* parent = (struct p2p *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + 
debugEvent(event, "KernelChStart"); + } + } else if (eDescr->type == ncclProfileNetPlugin) { + struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + int64_t pluginId = eDescr->netPlugin.id; + int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK; + int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK; + if (type == NCCL_PROFILER_NET_TYPE_IB) { + if (ver == 1) { + ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileQp) { + event->pluginEvent = ncclProfileQp; + event->qp.device = descr->qp.device; + event->qp.wr_id = descr->qp.wr_id; + event->qp.opcode = descr->qp.opcode; + event->qp.qpNum = descr->qp.qpNum; + event->qp.length = descr->qp.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) { + if (ver == 1) { + ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileSocket) { + event->pluginEvent = ncclProfileSocket; + event->sock.fd = descr->sock.fd; + event->sock.op = descr->sock.op; + event->sock.length = descr->sock.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } } return ncclSuccess; } @@ -445,6 +515,15 @@ void updateEvent(void* handle) { struct proxyCtrl* event = (struct proxyCtrl *)handle; event->stopTs = gettime() - startTime; debugEvent(event, "ProxyCtrlStop"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)handle; + event->stopTs = gettime() - startTime; + updateEvent(event->parent); + debugEvent(event, "KernelChStop"); + } else if (type == ncclProfileNetPlugin) { + struct netPlugin* event = (struct netPlugin *)handle; + event->stopTs = gettime() - startTime; + debugEvent(event, "NetPluginStop"); } } @@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_t ncclProfiler_v2 = { +ncclProfiler_t ncclProfiler_v3 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, @@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = { exampleProfilerRecordEventState, exampleProfilerFinalize, }; + +int exampleProfilerStart(int eActivationMask) { + if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { + __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED); + } + return ncclSuccess; +} + +int exampleProfilerStop(void) { + if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { + __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED); + } + return ncclSuccess; +} diff --git a/ext-profiler/example/plugin.h b/ext-profiler/example/plugin.h new file mode 100644 index 000000000..b4d07060a --- /dev/null +++ b/ext-profiler/example/plugin.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PLUGIN_H_ +#define PLUGIN_H_ + +int exampleProfilerStart(int eActivationMask); +int exampleProfilerStop(void); + +#endif diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index f26a9eeb2..43f719045 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { } static __thread int proxyStepId; -__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { +__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) { if (event->isSend) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendWait", proxyStepId++, getpid(), 1, event->stopTs); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + } +} + +__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { + if (event->isSend) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendWait", proxyStepId++, getpid(), 1, event->stopTs); + } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", @@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { } } +static __thread int kernelId; +__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) { + if (event->type != ncclProfileKernelCh) return; + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", + "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId); +} + +__hidden void 
printKernelChEventTrailer(FILE* fh, struct kernelCh* event) { + if (event->type != ncclProfileKernelCh) return; + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "KernelCh", kernelId, getpid(), 1, event->stopTs); +} + static __thread int proxyCtrlId; __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { const char* str; @@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { str, proxyCtrlId++, getpid(), 1, event->stopTs); } +static __thread int ibQpId, sockId; +__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) { + if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileQp) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n", + "Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Qp", ibQpId++, getpid(), 1, event->stopTs); + } + } + } else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileSocket) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n", + "Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Sock", sockId++, getpid(), 1, event->stopTs); + } + } + } +} + //#define DEBUG_EVENTS void debugEvent(void* eHandle, const char* tag) { #ifdef DEBUG_EVENTS @@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, "Collective event %p tag = %s {\n", event, tag); fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); fprintf(fh, " parent = %p\n", event->base.parent); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + for (int j = 0; j < MAX_OPS; j++) { + for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); + for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + } fprintf(fh, " startTs = %f\n", event->base.startTs); fprintf(fh, " stopTs = %f\n", event->base.stopTs); fprintf(fh, "}\n"); @@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, " startTs = %f\n", event->startTs); fprintf(fh, " stopTs = %f\n", event->stopTs); fprintf(fh, "}\n"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)eHandle; + fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag); + fprintf(fh, " parent = %p\n", event->parent); + fprintf(fh, " channel = %d\n", event->channelId); + } else if (type == ncclProfileNetPlugin) { + struct 
netPlugin* event = (struct netPlugin *)eHandle; + fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag); + fprintf(fh, " pluginType = %d\n", event->pluginType); + fprintf(fh, " pluginVer = %d\n", event->pluginVer); + fprintf(fh, " pluginEvent = %d\n", event->pluginEvent); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); } fclose(fh); #endif @@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) { struct collective* c = (struct collective *)handle; printCollEventHeader(fh, c); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &c->kernel[i]); for (int j = 0; j < c->nProxyOps[i]; j++) { printEvent(fh, &c->send[i][j]); printEvent(fh, &c->recv[i][j]); } + printKernelChEventTrailer(fh, &c->kernel[i]); } printCollEventTrailer(fh, c); } else if (type == ncclProfileP2p) { struct p2p* p = (struct p2p *)handle; printP2pEventHeader(fh, p); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &p->kernel[i]); printEvent(fh, &p->op[i]); + printKernelChEventTrailer(fh, &p->kernel[i]); } printP2pEventTrailer(fh, p); } else if (type == ncclProfileProxyOp) { @@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) { printProxyOpEventTrailer(fh, p); } else if (type == ncclProfileProxyStep) { struct proxyStep* p = (struct proxyStep *)handle; - printProxyStepEvent(fh, p); + printProxyStepEventHeader(fh, p); + for (int q = 0; q < p->nNetEvents; q++) { + printNetPluginEvent(fh, &p->net[q]); + } + printProxyStepEventTrailer(fh, p); } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* p = (struct proxyCtrl *)handle; printProxyCtrlEvent(fh, p); From 0524aef7a0333bc79d885e392812519087eab71f Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Tue, 22 Apr 2025 13:50:40 -0700 Subject: [PATCH 08/21] NCCL 2.26.3-1 Minimize the performance impact of the device kernel profiling support when the profiler plugin is not loaded. Reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Fix the exchange of enhanced connection establishment (ECE) options to address potential slowdowns on networks utilizing RoCE. Test if cuMem host allocations work and if not, disable them. Enabled by default since NCCL 2.24 if the CUDA driver version is at least 12.6, such allocations rely on NUMA support, which is by default not available under Docker. We recommend invoking Docker with "--cap-add SYS_NICE" to enable it. Fix an initialization error when running with NCCL_NET_GDR_C2C=1 on multiple MNNVL domains with non-uniform network configurations across nodes. Fix the printing of sub-seconds in the debug log when using a custom NCCL_DEBUG_TIMESTAMP_FORMAT setting. 
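
Regarding the sub-second timestamp item above, the arithmetic boils down to deriving the divisor applied to tv_nsec from a full second in nanoseconds; otherwise the printed fraction overflows its digit field. The following is a standalone sketch (not NCCL code; maxSubseconds and digits are illustrative values chosen for three millisecond digits):

```c
#include <stdio.h>
#include <time.h>

int main(void) {
  struct timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);          // tv_nsec is in [0, 999999999]
  unsigned long maxSubseconds = 1000;          // 10^digits, here 3 sub-second digits
  int digits = 3;
  // Correct divisor: 1000000000UL / 1000 = 1000000, so the result fits in 3 digits.
  printf("good: %0*ld\n", digits, (long)(ts.tv_nsec / (1000000000UL / maxSubseconds)));
  // Buggy divisor (pre-fix): 1000000UL / 1000 = 1000, the result can be up to 6 digits wide.
  printf("bad:  %0*ld\n", digits, (long)(ts.tv_nsec / (1000000UL / maxSubseconds)));
  return 0;
}
```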
--- makefiles/version.mk | 2 +- src/debug.cc | 6 +++- src/device/common.h | 57 +++++++++++++++++++++++++++++++++----- src/enqueue.cc | 27 ++++++++++++++---- src/graph/paths.cc | 6 ++-- src/include/device.h | 2 ++ src/include/graph.h | 2 +- src/include/profiler.h | 1 + src/include/proxy.h | 6 ++++ src/include/strongstream.h | 3 ++ src/misc/cudawrap.cc | 31 +++++++++++++++++++++ src/misc/strongstream.cc | 32 +++++++++++++++++++++ src/plugin/profiler.cc | 6 +++- src/proxy.cc | 20 +++++++------ src/transport/coll_net.cc | 2 +- src/transport/net.cc | 2 +- src/transport/net_ib.cc | 11 ++++---- 17 files changed, 182 insertions(+), 34 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index df3ee5c68..93a71d49d 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 2 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/debug.cc b/src/debug.cc index 2eb8d7749..e2cc4f810 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -195,6 +195,10 @@ static void ncclDebugInit() { } } + // Replace underscore with spaces... it is hard to put spaces in command line parameters. + for (int i=0; ncclDebugTimestampFormat[i] != '\0'; ++i) { + if (ncclDebugTimestampFormat[i]=='_') ncclDebugTimestampFormat[i] = ' '; + } // Cache pid and hostname getHostName(hostname, 1024, '.'); @@ -301,7 +305,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file snprintf(localTimestampFormat + ncclDebugTimestampSubsecondsStart, ncclDebugTimestampSubsecondDigits+1, "%0*ld", ncclDebugTimestampSubsecondDigits, - ts.tv_nsec / (1000000UL/ncclDebugTimestampMaxSubseconds)); + ts.tv_nsec / (1000000000UL/ncclDebugTimestampMaxSubseconds)); strcpy( localTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits, ncclDebugTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits); } diff --git a/src/device/common.h b/src/device/common.h index 2dca70dc2..855db730f 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -54,6 +54,7 @@ struct ncclShmemData { int workSize; uint32_t workConsumed; uint64_t workCounter; + bool profilerEnabled; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; @@ -291,6 +292,48 @@ struct RunWorkBatch { } }; +#define START 0 +#define STOP 1 +#define FINI 2 + +__device__ __forceinline__ bool profilerEnabled(void) { + // Check if any of the workItems in the batch is profiled. If so, there is an equivalent + // profiler ProxyOp waiting for the counter update in the host thread. If this check was + // done only for the first workItem the profiler counter for other workItems in the batch + // could never be updated, leaving the host thread spinning forever for the counter update + // and causing a hang. 
+ bool enabled = false; + for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) { + if (ncclShmem.workType == ncclDevWorkTypeP2p) + enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled; + else + enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled; + } + return enabled; +} + +__device__ __forceinline__ void profiler(int action) { + if (action == START) { + if (threadIdx.x == 0) { + // increment workCounter regardless of the profiler being active or not + ncclShmem.channel.workCounter += ncclShmem.nWorks; + if(!profilerEnabled()) return; + ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } else if (action == STOP) { + if (threadIdx.x == 0 && profilerEnabled()) { + ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } else { // FINI + if (threadIdx.x == 0) { + // store the workCounter back to vidmem regardless of the profiler being active or not + ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; + if (!profilerEnabled()) return; + ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } +} + template __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) { int tid = threadIdx.x; @@ -312,7 +355,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } __syncthreads(); // publish ncclShmem.{args, channelId} /* set abort flag to 0 */ - if (tid == 0) ncclShmem.aborted = 0; + if (tid == 0) { + ncclShmem.aborted = 0; + ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; + } // Use first 2 warps to load comm and channel, and remaining load work batch. 
switch (tid/WARP_SIZE) { @@ -348,7 +394,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } while (ncclShmem.aborted == 0) { - if (tid == 0) ncclShmem.comm.workStarted[ncclShmem.channelId] = (ncclShmem.channel.workCounter += ncclShmem.nWorks); + profiler(START); if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { SpecializedRunWorkBatch().run(); } else { @@ -358,7 +404,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a if (ncclShmem.nextBatchIx == -1) break; int batchIx = ncclShmem.nextBatchIx; __syncthreads(); - if (tid == 0) ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + profiler(STOP); loadWorkBatchToShmem(tid, tn, args, batchIx); __syncthreads(); @@ -367,10 +413,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } } - if (tid == 0) { - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; - } + profiler(FINI); } __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); diff --git a/src/enqueue.cc b/src/enqueue.cc index 5e0b213fc..4e8a211fc 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -288,6 +288,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { devWork.oneNode = (comm->nNodes == 1); devWork.isOneRPN = comm->isOneRPN; devWork.netRegUsed = devWork.regUsed = 0; + devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) @@ -445,6 +446,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; devWork.oneNode = (comm->nNodes == 1); devWork.netRegUsed = devWork.regUsed = 0; + devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) @@ -557,7 +559,7 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp.task.coll = task; proxyOp.rank = comm->rank; proxyOp.eActivationMask = task->eActivationMask; - proxyOp.workCounter = ++comm->profiler.workCounter[c]; + proxyOp.incWorkCounter = true; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); @@ -681,7 +683,7 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp->ringAlgo->incRefCount(); } proxyOp->eActivationMask = task->eActivationMask; - proxyOp->workCounter = ++comm->profiler.workCounter[c]; + proxyOp->incWorkCounter = true; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. @@ -886,6 +888,7 @@ static ncclResult_t addP2pToPlan( work->recvRank = recvRank; work->recvAddr = recvAddr; work->recvBytes = recvBytes==-1 ? 
0 : recvBytes; + work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh); struct ncclProxyOp proxyOps[2] = {}; int nProxyOps = selfSend ? 0 : 2; @@ -910,6 +913,7 @@ static ncclResult_t addP2pToPlan( nChannelsMax = std::max(nChannels[0], nChannels[1]); for (int part=0; part < nChannelsMax; part++) { + int incWorkCounter = -1; int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); plan->channelMask |= uint64_t(1)< pair rather than individual p2p + if (proxyOps[dir].nsteps && incWorkCounter < 0) { + proxyOps[dir].incWorkCounter = true; + incWorkCounter = dir; + } + if (proxyOps[dir].nsteps != 0) { // Calculate the opCount after adding batch since then the batch count will // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; - proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } - comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1592,7 +1600,16 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); - CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + + // We know that deviceStream is strictly behind the launchStream because launchStream + // synced with it before kernel launch. This allows us to to see deviceStream waiting + // on launchStream as a fast-forward. When building CUDA graphs fast forwards should + // be handled specially so as not to create graphs with a blowup in the number of edges. 
+ // So we could do this: + // CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // But instead we do: + NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent)); + // Each userStream[i] waits on userStream[0] for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index ace4476f6..998371247 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -367,7 +367,7 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", + TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId); *ret = 1; } @@ -473,7 +473,7 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) { +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush) { *flush = 1; ncclNetProperties_t props; NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); @@ -488,7 +488,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* // flags would go through C2C. In that case, force a flush. int c, n; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { *flush = 1; } diff --git a/src/include/device.h b/src/include/device.h index 0763a579a..f6ca51b75 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -221,6 +221,7 @@ struct alignas(16) ncclDevWorkP2p { uint8_t sendProtoLL:1, recvProtoLL:1; uint8_t sendNetReg:1, recvNetReg:1; uint8_t sendIpcReg:1, recvIpcReg:1; + uint8_t profilerEnabled:1; }; // Compute the subset of the data transfer corresponding to the given part index. 
@@ -259,6 +260,7 @@ struct alignas(16) ncclDevWorkColl { uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1; + uint32_t profilerEnabled:1; uint32_t root; void* recvbuff; void* sendbuff; diff --git a/src/include/graph.h b/src/include/graph.h index b779773da..a06556e37 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -43,7 +43,7 @@ enum ncclTopoGdrMode { ncclTopoGdrModeNum = 3 }; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); diff --git a/src/include/profiler.h b/src/include/profiler.h index 8d4107963..bae0501bb 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -68,6 +68,7 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); +bool ncclProfilerPluginLoaded(void); // Profiler callback for network plugin ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); diff --git a/src/include/proxy.h b/src/include/proxy.h index 225acb22d..f90c80275 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -88,6 +88,12 @@ struct ncclProxyOp { struct ncclTaskP2p* p2p; } task; + // Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment. + // Always 'true' for collective operations. Grouped p2p operations are fused into one pair in the GPU kernel, + // meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this + // reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done + // by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue. + bool incWorkCounter; int eActivationMask; void* taskEventHandle; int rank; diff --git a/src/include/strongstream.h b/src/include/strongstream.h index c56d5aca5..393a1f0b1 100644 --- a/src/include/strongstream.h +++ b/src/include/strongstream.h @@ -102,6 +102,9 @@ ncclResult_t ncclStreamWaitStream( cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent ); +// Like cudaStreamWaitEvent except `e` must be strictly ahead of everything in `s`. +ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e); + // Synchrnoization does not need the strong stream to be acquired. 
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index e5fec1e46..64a84f556 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -4,6 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ +#include "alloc.h" #include "nccl.h" #include "debug.h" #include "param.h" @@ -67,6 +68,36 @@ int ncclCuMemHostEnable() { ncclCumemHostEnable = paramValue; else ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0; + if (ncclCumemHostEnable) { + // Verify that host allocations actually work. Docker in particular is known to disable "get_mempolicy", + // causing such allocations to fail (this can be fixed by invoking Docker with "--cap-add SYS_NICE"). + int cudaDev; + CUdevice currentDev; + int cpuNumaNodeId = -1; + CUmemAllocationProp prop = {}; + size_t granularity = 0; + size_t size; + CUmemGenericAllocationHandle handle; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev)); + if (cpuNumaNodeId < 0) cpuNumaNodeId = 0; + prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = ncclCuMemHandleType; + prop.location.id = cpuNumaNodeId; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + size = 1; + ALIGN_SIZE(size, granularity); + if (CUPFN(cuMemCreate(&handle, size, &prop, 0)) != CUDA_SUCCESS) { + INFO(NCCL_INIT, "cuMem host allocations do not appear to be working; falling back to a /dev/shm/ based " + "implementation. This could be due to the container runtime disabling NUMA support. " + "To disable this warning, set NCCL_CUMEM_HOST_ENABLE=0"); + ncclCumemHostEnable = 0; + } else { + CUCHECK(cuMemRelease(handle)); + } + } } return ncclCumemHostEnable; error: diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index e6cce9807..7d957d432 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -328,6 +328,38 @@ ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t sc return ncclSuccess; } +ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e) { + if (g.graphId == ULLONG_MAX) { + CUDACHECK(cudaStreamWaitEvent(s, e, 0)); + } else { + cudaStream_t tmp; + CUDACHECK(cudaStreamCreateWithFlags(&tmp, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamWaitEvent(tmp, e, 0)); + + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. 
+ cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, &edges, &count)); + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, edges, count, cudaStreamSetCaptureDependencies)); + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies)); + } + + CUDACHECK(cudaStreamDestroy(tmp)); + } + return ncclSuccess; +} + ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc index 023a704f4..18b9b5c4f 100644 --- a/src/plugin/profiler.cc +++ b/src/plugin/profiler.cc @@ -536,11 +536,15 @@ static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxy } bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) { - bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh)); + bool enabled = ncclProfilerPluginLoaded() && (op->eActivationMask & ncclProfileKernelCh); if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op); return enabled; } +bool ncclProfilerPluginLoaded(void) { + return (__builtin_expect(ncclProfiler != NULL, 0)); +} + ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; diff --git a/src/proxy.cc b/src/proxy.cc index 7e8021e47..c27d23455 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -9,7 +9,6 @@ #include "collectives.h" #include "socket.h" #include "shmutils.h" -#include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" #include "profiler.h" @@ -533,15 +532,21 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon return ncclSuccess; } +static void incWorkCounter(struct ncclComm* comm, struct ncclProxyOp* op) { + op->workCounter = (op->incWorkCounter) ? ++comm->profiler.workCounter[op->channelId] : comm->profiler.workCounter[op->channelId]; +} + static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? &comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId]; - if (justInquire) *justInquire = true; - else { + if (justInquire) { + *justInquire = true; + if (!comm->planner.persistent) incWorkCounter(comm, op); + } else { op->sendbuff = (uint8_t *)comm->profiler.workStarted; op->recvbuff = (uint8_t *)comm->profiler.workCompleted; - NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter - op->workCounter += comm->profiler.workCounter[op->channelId]; + if (comm->planner.persistent) incWorkCounter(comm, op); + NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); } return ncclSuccess; } @@ -696,9 +701,8 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; case ncclPatternProfiler: { - if (ncclProfilerNeedsProxy(comm, op)) { - NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); - } + if (ncclProfilerNeedsProxy(comm, op)) NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); + else incWorkCounter(comm, op); } break; } return ncclSuccess; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index c1ccfcaa8..84e1f84a0 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -192,7 +192,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, netId, req.netDev, myInfo->rank, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); diff --git a/src/transport/net.cc b/src/transport/net.cc index 40d334fa7..61b15ce20 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -250,7 +250,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (!req.useGdr && connIndex == 0) comm->useGdr = 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, netId, req.netDev, myInfo->rank, &req.needFlush)); // We don't support PXN on receive yet NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bfff6e555..c049531f8 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -1641,17 +1641,18 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // However, this has been confirmed to be intentional. 
// coverity[copy_paste_error] NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); - - // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) - // Store this in our own qpInfo for returning to the requestor - if (meta.qpInfo[q].ece_supported) - NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); } else { meta.qpInfo[q].ece_supported = 0; } NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); + + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) + // Store this in our own qpInfo for returning to the requestor + if (remMeta.qpInfo[q].ece_supported && meta.qpInfo[q].ece_supported) { + NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); + } } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) From 3000e3c797b4b236221188c07aa09c1f3a0170d4 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Tue, 22 Apr 2025 13:55:13 -0700 Subject: [PATCH 09/21] NCCL 2.26.5-1 Work around a potential hang in alltoall-like communication patterns on MNNVL systems at a scale of over 80 ranks. --- makefiles/version.mk | 2 +- src/init.cc | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index 93a71d49d..c5ed6ab70 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/init.cc b/src/init.cc index 46b02e65e..47d7fa3c6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -271,7 +271,7 @@ NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) -NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", -1); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; @@ -458,12 +458,22 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ccEnable) { comm->workFifoBytes = 0; } else { - comm->workFifoBytes = ncclParamWorkFifoBytes(); - if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { - WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + int64_t workFifoBytesParam = ncclParamWorkFifoBytes(); + if (workFifoBytesParam == -1) { + if (comm->MNNVL && (comm->compCap >= 100)) { + // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL + INFO(NCCL_INIT, "Disabling work fifo"); + comm->workFifoBytes = 0; + } else { + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + } else { + if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + comm->workFifoBytes = std::min(workFifoBytesParam, 1ul<<30); } - comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) { From 
8171af656bb3c47c8fc60b7cd49ae0c7494de664 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Mon, 19 May 2025 09:15:40 -0700 Subject: [PATCH 10/21] NCCL 2.26.6-1 Fix profiler_v2 compatibility layer * Removing trafficBytes in profiler_v3 breaks casting to ncclProfilerEventDescr_v2_t in the compatibility layer for profiler_v2 interface. This patch fixes the issue by making the conversion between the two descriptors explicit. --- makefiles/version.mk | 2 +- src/plugin/profiler/profiler_v2.cc | 50 +++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index c5ed6ab70..5c0b0de9a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 3d00008a6..52907d6e3 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -12,11 +12,53 @@ static ncclProfiler_t ncclProfiler; static ncclProfiler_v2_t* ncclProfiler_v2; static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { - if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) { - *eHandle = NULL; - return ncclSuccess; + *eHandle = nullptr; + ncclProfilerEventDescr_v2_t eDescr_v2 = { }; + eDescr_v2.type = eDescr->type; + eDescr_v2.parentObj = eDescr->parentObj; + eDescr_v2.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v2.coll.name = eDescr->coll.name; + eDescr_v2.coll.commHash = eDescr->coll.commHash; + eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v2.coll.func = eDescr->coll.func; + eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v2.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v2.coll.count = eDescr->coll.count; + eDescr_v2.coll.root = eDescr->coll.root; + eDescr_v2.coll.datatype = eDescr->coll.datatype; + eDescr_v2.coll.trafficBytes = 0; // removed in v3 + eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v2.coll.nWarps = eDescr->coll.nWarps; + eDescr_v2.coll.algo = eDescr->coll.algo; + eDescr_v2.coll.proto = eDescr->coll.proto; + } break; + case ncclProfileP2p: { + eDescr_v2.p2p.name = eDescr->p2p.name; + eDescr_v2.p2p.commHash = eDescr->p2p.commHash; + eDescr_v2.p2p.func = eDescr->p2p.func; + eDescr_v2.p2p.buff = eDescr->p2p.buff; + eDescr_v2.p2p.count = eDescr->p2p.count; + eDescr_v2.p2p.datatype = eDescr->p2p.datatype; + eDescr_v2.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v2.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v2.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v2.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v2.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v2.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v2.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v2.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + default: return ncclSuccess; } - return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr); + return ncclProfiler_v2->startEvent(context, eHandle, &eDescr_v2); } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { From 
72d2432094d6ae36abd6e511c3a16a2d052dbf94 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Thu, 29 May 2025 20:56:40 -0700 Subject: [PATCH 11/21] NCCL 2.27.3-1 Symmetric memory API and symmetric kernels * Redesign from the ground up, enabling major latency and bandwidth improvements. * Add new API calls to register user-allocated memory among communicator ranks into a NCCL window: ncclCommWindowRegister() and ncclCommWindowDeregister(). The calls currently support symmetric registration for P2P and NVLS, and require VMM memory buffers (i.e., CUMEM must be operational). * Implement specialized kernels taking advantage of symmetrically registered memory, with performance gains expected particularly for small to medium message sizes. * The kernels support 32 bit floating point types and smaller, and sum as the reduction operator, with no more than one collective operation per group. * Floating point summation is always done in fp32 accumulators (with the exception of fp8 on NVLS, where it uses fp16 inside the switch). Thus, the accuracy with fp8 and fp16 data types should be much improved. * This initial implementation supports non-network communicators only (P2P and NVLS transports). * To explore this functionality users need to use the new memory registration API calls with the NCCL_WIN_COLL_SYMMETRIC flag and all ranks of a communicator must pass buffers at the same offset in the same registration when invoking a collective NCCL operation. Add support for DGX Spark. Add support for DirectNIC (CX8) to the internal IB plugin. Add a new ncclCommShrink() API call * It is a non-collective call similar to ncclCommSplit(), which makes it possible to exclude some (possibly unresponsive) ranks from the parent communicator. Add support for loading multiple network plugins * This enables the creation of generic containers that can work across a range of providers. * Allow NCCL_NET_PLUGIN to accept a comma-separated list of plugins to load. NVLink SHARP (NVLS) improvements * Implement NVLS+IB SHARP support for AllGather and ReduceScatter with user buffer registration. This improves performance and reduces the number of CTAs needed to achieve peak bandwidth. * Gracefully fall back by default to other transports if NVLS initialization fails (the old behavior of returning an error code from a NCCL call can be preserved by setting NCCL_NVLS_ENABLE=1). * Decrease the NVLS channel count to 24 on Blackwell systems with multiple NVLink domains per communicator. * Enable fine-tuning of NCCL behavior per communicator using new "ncclConfig_t" members "collnetEnable", "CTAPolicy", and "nvlsCTAs". Profiler improvements * Extend the init function by adding communicator name, comm id (hash), rank, number of ranks, number of nodes, and the NCCL log function to the argument list. This makes the name and the comm id available to all events in the communicator without explicitly passing them to each individual event. Add the communicator id and rank to the profiler trace filename. Now, the communicator name can be set via a new "ncclConfig_t" member "commName". * Improve the accuracy of the GPU kernel events by providing GPU-generated timestamps for the start and stop of every NCCL operation. * Harmonize proxy events, removing overlaps between ProxyOp and ProxyStep states. * Add support for network-defined event updates (through "recordEventState"). * Report the correct number of channels used by every collective/p2p operation (used to be set to nMaxChannels for collectives and absent for p2ps). 
* Fix the logic on proxyCtrl Idle/Active events (Issue #1162). * Fix an issue where the network proxy profiler could lose track of an event identifier (Issue #1682). * Improve the backward compatibility with plugins older than v4. * Ensure that the work counters are 0-initialized. * Fix a potential race condition in the network profiler that could result in an event being linked to a wrong parent. MNNVL improvements * Increase to 16 the number of NICs used to communicate between MNNVL domains on GB200 systems, to optimize the performance of collective operations. * Add support for more complex MNNVL topologies with up to 32 NICs per node. * If the MNNVL fabric initialization was unsuccessful, NCCL will now fail by default, so as to avoid inadvertently falling back to a potentially much slower network transport. Such failures are typically due to misconfigured IMEX support on the system. To continue without MNNVL, restart the job with NCCL_MNNVL_ENABLE=0. * Fix a potential hang in alltoall-like communication patterns at a scale of over 80 ranks. * Make NCCL_P2P_DISABLE=1 imply NCCL_MNNVL_ENABLE=0 (so the latter no longer needs to be specified on MNNVL systems). * Fix an initialization failure when NCCL_TOPO_FILE is used on MNNVL systems. * Fix the graph search to exclude non-local NICs. * Fix the SHM transport to use fabric handles on MNNVL systems. NIC Fusion improvements * Disable the creation of fused NICs for physical devices that haven't been merged. * Flatten multiple ports to a single PCI device within the internal IB plugin and reparent dual-port NICs under the first PCI parent. If the parent is not a PCI switch, PCI devices for fused NICs won't be duplicated. * Route traffic on GB200-CX8 systems through DirectNIC, not the host interface. Improve support for platforms with C2C connectivity (e.g., GB200) * Enable GPUDirect RDMA for the NICs by default. * Add support for P2C (PXN over C2C) and the LL128 protocol. Extend NCCL fault tolerance in multithreaded scenarios * Support the creation of multiple nonblocking communicators within a single group and polling in parallel for the completion using multiple threads (one per communicator). Enable ncclImplicitOrderLaunch for CUDA 12.9+ * This can potentially speed up NCCL_IMPLICIT_LAUNCH_ORDER. Improve the netSocket transport latency and control * Provide finer control over the size of the socket send/receive buffers, the task size, and the number of sockets that a single peer can open. * Add support for the inlining of small messages behind the header when using multiple sockets per connection. Improve the readability of the CPU affinity in the debug output * Print it as a range string rather than a bitmask. Fix a potential race condition in graph execution * A contention could arise when mixing graph and non-graph execution. Improve PXN connection code * Avoid duplicate and unused connections. RAS fixes * Fix a memory corruption at job termination time in case of a previously failed initialization of a RAS socket connection. * Fix a race condition leading to a crash when generating a RAS report during communicator initialization (Issues #1669, #1718). * Fix a potential race condition when gathering data for a RAS status report. Fix a potential memory corruption in ncclCommSplit() * Memory could get corrupted when resource sharing was in use and the size of the NVLink domain in the new communicator was smaller than in the old one. Fix asynchronous graph upload * Fix a small memory leak. * Fix oversynchronization.
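Returning to the symmetric memory API introduced at the top of these notes: the sketch below illustrates the intended registration flow. It is a minimal, hedged example, not part of this patch; the exact ncclCommWindowRegister()/ncclCommWindowDeregister() prototypes and the ncclWindow_t handle name are assumptions based on the description above, so consult nccl.h from this release for the shipped signatures. The buffer is allocated with ncclMemAlloc() so the CUMEM/VMM requirement is met, and every rank must pass its buffer at the same offset of the same registration when calling the collective.
```
/* Hedged sketch of symmetric window registration (assumed prototypes,
 * see the release notes above). Not taken from this patch. */
#include <nccl.h>
#include <cuda_runtime.h>

#define CHECK(cmd) do { ncclResult_t r = (cmd); \
  if (r != ncclSuccess) return r; } while (0)

ncclResult_t symmetricAllReduce(ncclComm_t comm, size_t count, cudaStream_t stream) {
  float* buf = NULL;
  ncclWindow_t win;                 /* window handle type name assumed */
  size_t bytes = count * sizeof(float);

  /* VMM/CUMEM-backed allocation, required for symmetric registration. */
  CHECK(ncclMemAlloc((void**)&buf, bytes));

  /* Collective registration with the NCCL_WIN_COLL_SYMMETRIC flag named
   * in the notes; all ranks register a buffer of the same size. */
  CHECK(ncclCommWindowRegister(comm, buf, bytes, &win, NCCL_WIN_COLL_SYMMETRIC));

  /* In-place fp32 sum; the symmetric kernels accumulate in fp32. */
  CHECK(ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream));
  cudaStreamSynchronize(stream);

  CHECK(ncclCommWindowDeregister(comm, win));
  CHECK(ncclMemFree(buf));
  return ncclSuccess;
}
```
As stated above, this initial implementation covers non-network communicators only (P2P and NVLS transports), so the symmetric path applies to NVLink-connected communicators.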
Add a check for out-of-memory conditions in ncclMemAlloc() Clean up the NCCL socket code * accept() will retry also if just reading the magic failed (Issue #1613). * connect() will retry also if poll() did not return a POLLOUT event (Issue #1618). * Add error checking in a few instances (Issue #1539). * Fix the loop condition in ncclFindInterfaceMatchSubnet() (Issue #1574). * Clean up the debug output, downgrading WARN messages to INFO in non-critical cases, and printing the peer's address where relevant. Switch NCCL_DEBUG_FILE to line buffering * This should help avoid mixed-up partial output lines in multithreaded cases. Other minor fixes * Improve the checks for buffer overflows in the graph code (Issue #1585). * Extend logging and state clearing to all four events in the internal IB plugin (Issue #1650). * Fix the error path in case IB communication is not ready (Issue #1489). * Add ECE logging for IB fabric. * Fix various minor issues in the graph module (Issue #1635). * Clean up the debug output in the graph code, downgrading WARN messages to INFO in non-critical cases. * Add a missing argument to a directSend() call (Issue #1628). * Remove duplicate code in sendProxySetup() (Issue #1420). * Fix the order of arguments of cudaDeviceCanAccessPeer() (Issue #1507). * Fix compiler warnings with GCC 14. * Fix a typo in a comment (Issue #1236). --- ext-net/example/nccl/common.h | 6 + ext-net/example/nccl/net.h | 4 +- ext-profiler/README.md | 127 +++--- ext-profiler/example/event.h | 49 +-- ext-profiler/example/nccl/profiler.h | 54 ++- ext-profiler/example/nccl/profiler_v3.h | 5 - ext-profiler/example/nccl/profiler_v4.h | 123 ++++++ ext-profiler/example/plugin.c | 82 +++- ext-profiler/example/print_event.c | 77 ++-- ext-profiler/example/print_event.h | 3 + makefiles/common.mk | 31 +- makefiles/version.mk | 4 +- src/Makefile | 2 +- src/allocator.cc | 196 +++++++++ src/bootstrap.cc | 9 +- src/channel.cc | 2 +- src/debug.cc | 58 ++- src/device/Makefile | 45 ++- src/device/all_gather.h | 260 +++++++++--- src/device/all_reduce.h | 2 +- src/device/common.h | 68 ++-- src/device/generate.py | 2 +- src/device/op128.h | 99 ++++- src/device/prims_simple.h | 18 +- src/device/reduce_kernel.h | 445 ++++++++++++++------ src/device/reduce_scatter.h | 246 ++++++++--- src/device/symmetric/all_gather.cuh | 367 +++++++++++++++++ src/device/symmetric/all_reduce.cuh | 432 ++++++++++++++++++++ src/device/symmetric/generate.py | 294 ++++++++++++++ src/device/symmetric/kernel.cuh | 27 ++ src/device/symmetric/primitives.cuh | 420 +++++++++++++++++++ src/device/symmetric/reduce_scatter.cuh | 387 ++++++++++++++++++ src/enqueue.cc | 392 +++++++++++------- src/graph/connect.cc | 10 +- src/graph/paths.cc | 98 +++-- src/graph/search.cc | 43 +- src/graph/topo.cc | 134 +++--- src/graph/topo.h | 30 +- src/graph/tuning.cc | 92 +++-- src/graph/xml.cc | 37 +- src/graph/xml.h | 11 +- src/group.cc | 356 ++++++++++------ src/include/allocator.h | 13 + src/include/bitops.h | 186 +++++++-- src/include/comm.h | 59 ++- src/include/cpuset.h | 25 ++ src/include/cudawrap.h | 70 ++-- src/include/device.h | 49 ++- src/include/graph.h | 6 +- src/include/group.h | 68 ++-- src/include/mlx5/mlx5dvcore.h | 18 + src/include/mlx5/mlx5dvsymbols.h | 23 ++ src/include/mlx5/mlx5dvwrap.h | 41 ++ src/include/nccl_common.h | 14 +- src/include/net.h | 2 - src/include/nvtx.h | 3 +- src/include/nvtx_payload_schemas.h | 10 + src/include/plugin/nccl_net.h | 7 +- src/include/plugin/nccl_profiler.h | 54 ++- src/include/plugin/profiler/profiler_v4.h | 123 ++++++ 
src/include/profiler.h | 13 +- src/include/proxy.h | 12 +- src/include/register.h | 24 +- src/include/register_inline.h | 33 ++ src/include/socket.h | 6 +- src/include/symmetric.h | 90 +++++ src/include/transport.h | 18 +- src/include/utils.h | 6 + src/init.cc | 470 ++++++++++++---------- src/misc/cudawrap.cc | 145 +++---- src/misc/ibvwrap.cc | 4 + src/misc/mlx5dvsymbols.cc | 74 ++++ src/misc/mlx5dvwrap.cc | 75 ++++ src/misc/socket.cc | 168 +++++--- src/misc/strongstream.cc | 34 ++ src/mnnvl.cc | 9 +- src/nccl.h.in | 41 +- src/plugin/net.cc | 372 +++++++++-------- src/plugin/plugin_open.cc | 65 +-- src/plugin/profiler.cc | 91 ++--- src/plugin/profiler/profiler_v1.cc | 40 +- src/plugin/profiler/profiler_v2.cc | 32 +- src/plugin/profiler/profiler_v3.cc | 93 ++++- src/plugin/profiler/profiler_v4.cc | 21 + src/proxy.cc | 17 +- src/ras/collectives.cc | 14 +- src/ras/rasnet.cc | 30 +- src/register/coll_reg.cc | 43 +- src/register/register.cc | 140 ++++++- src/symmetric.cc | 296 ++++++++++++++ src/transport.cc | 11 +- src/transport/coll_net.cc | 16 +- src/transport/net.cc | 52 +-- src/transport/net_ib.cc | 300 +++++++++----- src/transport/net_socket.cc | 97 +++-- src/transport/nvls.cc | 260 +++++++++--- src/transport/p2p.cc | 93 ++++- src/transport/profiler.cc | 13 +- src/transport/shm.cc | 2 +- 99 files changed, 7216 insertions(+), 2022 deletions(-) create mode 100644 ext-profiler/example/nccl/profiler_v4.h create mode 100644 src/allocator.cc create mode 100644 src/device/symmetric/all_gather.cuh create mode 100644 src/device/symmetric/all_reduce.cuh create mode 100755 src/device/symmetric/generate.py create mode 100644 src/device/symmetric/kernel.cuh create mode 100644 src/device/symmetric/primitives.cuh create mode 100644 src/device/symmetric/reduce_scatter.cuh create mode 100644 src/include/allocator.h create mode 100644 src/include/mlx5/mlx5dvcore.h create mode 100644 src/include/mlx5/mlx5dvsymbols.h create mode 100644 src/include/mlx5/mlx5dvwrap.h create mode 100644 src/include/plugin/profiler/profiler_v4.h create mode 100644 src/include/register_inline.h create mode 100644 src/include/symmetric.h create mode 100644 src/misc/mlx5dvsymbols.cc create mode 100644 src/misc/mlx5dvwrap.cc create mode 100644 src/plugin/profiler/profiler_v4.cc create mode 100644 src/symmetric.cc diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h index 912925225..5aec2f7bb 100644 --- a/ext-net/example/nccl/common.h +++ b/ext-net/example/nccl/common.h @@ -7,9 +7,15 @@ #ifndef COMMON_H_ #define COMMON_H_ +#include + typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop }; + +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + #endif diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 85ea79ef7..4cc66915b 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -8,9 +8,9 @@ #include #include 
-#include "common.h" #include "err.h" #include "net_device.h" +#include "common.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -23,8 +23,6 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 -typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); - #include "net_v10.h" #include "net_v9.h" #include "net_v8.h" diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 2a4018c07..27bd4e25c 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v3) +# API (v4) -Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections. ``` typedef struct { @@ -60,9 +60,15 @@ typedef struct { // init - initialize the profiler plugin // Input // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function // Output // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset // Input @@ -70,7 +76,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -82,13 +88,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v3_t; +} ncclProfiler_v4_t; ``` ## Error codes @@ -147,8 +153,6 @@ typedef struct { int rank; // rank that generated the event union { struct { // collective events metadata - const char* name; // string containing name of the communicator - uint64_t commHash; // unique hash/id for the communicator uint64_t seqNumber; // sequence number of this collective operation in the communicator const char* func; // string containing name of the collective void const* sendBuff; // address of send buffer @@ -156,20 +160,19 @@ typedef struct { size_t count; // data 
count int root; // root rank const char* datatype; // string containing the name of the datatype - uint8_t nMaxChannels; // max number of channels for this collective + uint8_t nChannels; // number of channels for this collective uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective const char* proto; // string containing name of the protocol for this collective } coll; struct { // point-to-point events metadata - const char* name; - uint64_t commHash; const char* func; void* buff; const char* datatype; size_t count; int peer; // peer rank for this point-to-point + uint8_t nChannels; // number of channels for this p2p } p2p; struct { // proxyOp events metadata @@ -178,7 +181,7 @@ typedef struct { int peer; // peer rank int nSteps; // number of network transfers/steps required by the `ncclProxyOp` int chunkSize; // chunk size for this `ncclProxyOp` - int isSend; // set to 1 for sends and 0 for recvs + int isSend; // type of network operation } proxyOp; struct { // proxyStep events metadata @@ -187,6 +190,7 @@ typedef struct { struct { uint8_t channelId; // id of the channel used by the kernel + uint64_t ptimer; // kernel supplied timestamp } kernelCh; struct { @@ -194,7 +198,7 @@ typedef struct { void* data; // pointer to network plugin defined event } netPlugin; }; -} ncclProfilerEventDescr_v3_t; +} ncclProfilerEventDescr_v4_t; ``` NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, @@ -212,45 +216,57 @@ handle after `eventStop` is undefined behavior. Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, cannot be updated through calls to `recordEventState`. -`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through -calls to `recordEventState`. +`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and +`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`. -The state of proxy generated events can be updated, along with event attributes, using -`recordEventState`. These events can go through several states during their lifecycle. -The list of supported states for the proxy-defined events is reported below. +The state of these events can be updated, along with event attributes, using `recordEventState`. +These events can go through several states during their lifecycle. + +The list of supported states for the updatable events is reported below. 
``` typedef enum { // ncclProfileProxyOp event states - ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step - ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank - ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank - ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step - ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step - ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank - ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step - ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19,// state marks transition of proxy op to progress // ncclProfileProxyStep event states - ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step - ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step - ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step - ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step - ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step + ncclProfilerProxyStepSendGPUWait = 8, // state marks the waiting of send data from GPU for given network transfer/step + ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step + ncclProfilerProxyStepSendWait = 9, // state marks the waiting of send data from network for given network transfer/step + ncclProfilerProxyStepRecvWait = 10,// state marks the waiting of recv data from network for given network transfer/step + ncclProfilerProxyStepRecvFlushWait = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step + ncclProfilerProxyStepRecvGPUWait = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step // ncclProfileProxyCtrl event states - ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle - ncclProfilerProxyCtrlActive, // state marks proxy progress thread active - ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping - ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up - ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin - ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end -} ncclProfilerEventState_v3_t; + ncclProfilerProxyCtrlIdle = 13,// state marks proxy progress thread idle + ncclProfilerProxyCtrlActive = 14,// state marks proxy progress thread active + ncclProfilerProxyCtrlSleep = 15,// state marks proxy progress thread 
sleeping + ncclProfilerProxyCtrlWakeup = 16,// state marks proxy progress thread waking up + ncclProfilerProxyCtrlAppend = 17,// state marks append of new network work item begin + ncclProfilerProxyCtrlAppendEnd = 18,// state marks append of new network work item end + + // ncclProfileNetPlugin event states + ncclProfilerNetPluginUpdate = 21,// state marks update of network defined event + + // ncclProfileKernelCh event states + ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update +} ncclProfilerEventState_v4_t; ``` `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyOp events are generated for every active channel and -provide a summary of the activity of the proxy progress thread for that channel. +provide a summary of the activity of the proxy progress thread for that channel. Most of the +states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting +with version 4 of the profiler interface these states have been deprecated. The same level of +information can still be obtained through the `ncclProfileProxyStep` events. `ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyStep events describe individual network transfer in @@ -348,15 +364,22 @@ reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported ``` typedef union { - struct { // attributes to update for ncclProfileProxyOp events - size_t transSize; // data transferred thus far - int steps; // network transfer/steps processed thus far - } proxyOp; + struct { // attributes for update for ncclProfileProxyStep events + size_t transSize; // transfer size field for this proxy step + } proxyStep; - struct { // attributes to update for ncclProfileProxyCtrl + struct { // attributes to update for ncclProfileProxyCtrl events int appendedProxyOps; // number of appended proxy ops thus far } proxyCtrl; -} ncclProfilerEventStateArgs_v3_t; + + struct { // attributes to update for ncclProfileNetPlugin events + void* data; // network plugin opaque update data field + } netPlugin; + + struct { // attribute to update for ncclProfileKernelCh events + uint64_t pTimer; // timestamp provided by the NCCL kernel + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. @@ -396,12 +419,12 @@ ProxyCtrl event ## Profiling of collective and p2p operations The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups, -collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature +collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit precisely. For example, without both proxy and/or kernel activity it is impossible for the profiler to figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to -the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if -these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent` +the profiler that the collective has been enqueued. 
The profiler can leverage proxy and/or kernel event information, if +these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent` call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent` increment and decrement the reference counter, respectively. @@ -425,8 +448,14 @@ enqueue can be time stamped by the profiler (at start and stop) to reconstruct t collective. However, this time only represents the launch time of the collective and not the actual execution time. To reconstruct the execution time more accurately proxy and kernel events are provided. +With version 3 of the profiler interface network activity is no longer required to do intra-node profiling. Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is -delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events. +delayed, a similar loss of accuracy can be encountered. + +To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements. +Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one +for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin that it can +convert them to CPU time domain. diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 0638f2df1..4c1b8f53a 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -15,24 +15,6 @@ #define MAX_CHANNELS 32 #define MAX_STEPS 16 #define MAX_OPS 16 // Up to 64K ranks for PAT - -#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) -#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) -#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait) -#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait) - -#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1) -#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1) -#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1) -#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1) - -#define PROXY_OP_SEND_STATE_IDX(state) (state - PROXY_OP_SEND_STATE_OFFSET) -#define PROXY_OP_RECV_STATE_IDX(state) (state - PROXY_OP_RECV_STATE_OFFSET) -#define PROXY_STEP_SEND_STATE_IDX(state) (state - PROXY_STEP_SEND_STATE_OFFSET) -#define PROXY_STEP_RECV_STATE_IDX(state) (state - PROXY_STEP_RECV_STATE_OFFSET) - -#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) -#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) #define MAX_EVENTS_PER_REQ (8) struct proxyOp; @@ -68,13 +50,24 @@ struct kernelCh { struct taskEventBase* parent; double startTs; double stopTs; + uint64_t startGpuClk; + uint64_t stopGpuClk; }; +#define PROXY_STEP_SEND_GPU_WAIT 0 +#define PROXY_STEP_SEND_PEER_WAIT 1 +#define PROXY_STEP_SEND_WAIT 2 +#define PROXY_STEP_RECV_WAIT 0 +#define PROXY_STEP_RECV_FLUSH_WAIT 1 +#define PROXY_STEP_RECV_GPU_WAIT 2 +#define PROXY_STEP_MAX_STATES 3 + struct proxyStep { uint8_t type; // type of event: network transfer + int state; int step; // network transfer id in given channel int isSend; // send/recv channel operation - double timestamp[MAX_PROXY_STEP_STATES]; + double timestamp[PROXY_STEP_MAX_STATES]; double startTs; double stopTs; struct proxyOp* parent; @@ -92,11 +85,8 @@ struct proxyOp { int chunkSize; // chunk size for this proxy operation int isSend; // send/recv channel operation size_t transSize; // transfer data size for this proxy operation - struct { - int steps; // completed steps for this proxy operation state - double timestamp; - } states[MAX_PROXY_OP_STATES]; double startTs; + double progrTs; // In progress state transition double stopTs; int stepCount; // last processed network operation for this proxy operation struct proxyStep step[MAX_STEPS]; // array of network transfer events @@ -119,8 +109,6 @@ struct proxyCtrl { struct taskEventBase { uint8_t type; // event type: collective/p2p int rank; // rank of the operation in NCCL communicator - const char* name; // FIXME: unused - uint64_t commHash; // communicator identifier const char* func; // ncclFunc* int refCount; // number of references for this operation struct group* parent; // parent event group @@ -137,12 +125,11 @@ struct collective { size_t count; int root; const char* datatype; - uint8_t nMaxChannels; + uint8_t nChannels; const char* algo; const char* proto; int nWarps; - struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events - struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events + struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; int nProxyOps[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; @@ -154,6 +141,7 @@ struct p2p { size_t count; const char* datatype; int peer; + uint8_t nChannels; struct proxyOp op[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; @@ -172,6 +160,11 @@ struct group { // arrays for different event objects struct context { + const char* commName; + uint64_t commHash; + int nranks; + int rank; + int groupPoolSize; int groupPoolBase; int groupPoolIndex; diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index d02202d51..c911426d9 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -25,42 +25,52 @@ enum { }; typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 
+ ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined events states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +#include "profiler_v4.h" #include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" #include "profiler_net.h" -typedef ncclProfiler_v3_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v4_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v3.h b/ext-profiler/example/nccl/profiler_v3.h index c1f1b919f..377118532 100644 --- a/ext-profiler/example/nccl/profiler_v3.h +++ b/ext-profiler/example/nccl/profiler_v3.h @@ -111,9 +111,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v3_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v3_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v4.h b/ext-profiler/example/nccl/profiler_v4.h new file mode 100644 index 000000000..489f264c4 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v4.h @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communciator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index 08408dba7..e3f707a0a 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -38,6 +38,9 @@ static int detachPoolIndex; static int detachPoolDone; static struct proxyOp* detachPool; +ncclDebugLogger_t logFn; +#define INFO(FLAGS, ...) 
logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) + static double freq = -1; __hidden void calibrate() { struct timeval tv; @@ -60,7 +63,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; static int* eActivationMaskPtr; -__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { +__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool @@ -106,6 +109,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); + ctx->commName = commName; + ctx->commHash = commHash; + ctx->nranks = nranks; + ctx->rank = rank; + logFn = logfn; + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank); + ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); if (ctx->groupPool == NULL) goto fail; @@ -142,17 +152,16 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) __hidden ncclResult_t exampleProfilerFinalize(void* context) { FILE* fh = NULL; char filename[PATH_MAX] = { 0 }; - char hostname[64] = { 0 }; - gethostname(hostname, 64); + struct context* ctx = (struct context *)context; const char* dump = getenv("NCCL_PROFILE_DUMP_FILE"); if (dump) { - sprintf(filename, "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid)); + sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank); fh = fopen(filename, "w"); fprintf(fh, "[\n"); } + INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank); // print last N groups/collectives/p2ps - struct context* ctx = (struct context *)context; int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? 
ctx->groupPoolIndex - groupPoolSize : 0; int end = ctx->groupPoolIndex; for (int i = start; i < end; i++) { @@ -243,8 +252,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->base.type = ncclProfileColl; event->base.rank = eDescr->rank; - event->base.name = eDescr->coll.name; - event->base.commHash = eDescr->coll.commHash; event->base.func = eDescr->coll.func; event->base.startTs = gettime() - startTime; event->base.parent = parent; @@ -254,7 +261,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->nMaxChannels = eDescr->coll.nMaxChannels; + event->nChannels = eDescr->coll.nChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; event->proto = eDescr->coll.proto; @@ -281,8 +288,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->base.type = ncclProfileP2p; event->base.rank = eDescr->rank; - event->base.name = eDescr->p2p.name; - event->base.commHash = eDescr->p2p.commHash; event->base.func = eDescr->p2p.func; event->base.next = parent->eventHead; event->base.startTs = gettime() - startTime; @@ -291,6 +296,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->p2p.count; event->datatype = eDescr->p2p.datatype; event->peer = eDescr->p2p.peer; + event->nChannels = eDescr->p2p.nChannels; *eHandle = event; // increment the group ref counter so the event will staty open taskEventQueueEnqueue(parent, (struct taskEventBase *)event); @@ -331,6 +337,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->startTs = gettime() - startTime; event->parent = NULL; + event->stepCount = 0; *eHandle = event; debugEvent(event, "PxnProxyOpStart"); return ncclSuccess; @@ -339,9 +346,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (eventBase->type == ncclProfileColl) { struct collective* parent = (struct collective *)eDescr->parentObj; int channelId = eDescr->proxyOp.channelId; - struct proxyOp* event = (eDescr->proxyOp.isSend) ? 
- &parent->send[channelId][parent->nProxyOps[channelId]++] : - &parent->recv[channelId][parent->nProxyOps[channelId]++]; + struct proxyOp* event = &parent->op[channelId][parent->nProxyOps[channelId]++]; event->type = ncclProfileProxyOp; event->channelId = channelId; @@ -353,6 +358,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->parent = eventBase; event->startTs = gettime() - startTime; + event->stepCount = 0; *eHandle = event; __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); @@ -370,6 +376,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->parent = eventBase; event->startTs = gettime() - startTime; + event->stepCount = 0; *eHandle = event; __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); @@ -382,9 +389,10 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n int s = parent->stepCount++ % MAX_STEPS; struct proxyStep* event = &parent->step[s]; event->type = ncclProfileProxyStep; + event->state = 0; event->step = eDescr->proxyStep.step; - event->isSend = parent->isSend; event->parent = parent; + event->isSend = parent->isSend; event->startTs = gettime() - startTime; event->nNetEvents = 0; *eHandle = event; @@ -397,6 +405,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; event->type = ncclProfileKernelCh; event->channelId = eDescr->kernelCh.channelId; + event->startGpuClk = eDescr->kernelCh.pTimer; event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; @@ -407,6 +416,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; event->type = ncclProfileKernelCh; event->channelId = eDescr->kernelCh.channelId; + event->startGpuClk = eDescr->kernelCh.pTimer; event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; @@ -563,29 +573,57 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - debugEvent(eHandle, "RecordEventState"); uint8_t type = *(uint8_t *)eHandle; if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; - int steps = event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps; - if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == steps) return ncclSuccess; - event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps = eStateArgs->proxyOp.steps; - event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].timestamp = gettime() - startTime; - event->transSize = eStateArgs->proxyOp.transSize; + if (eState == ncclProfilerProxyOpInProgress_v4) { + event->progrTs = gettime() - startTime; + } } else if (type == ncclProfileProxyStep) { struct proxyStep* event = (struct proxyStep *)eHandle; - event->timestamp[event->isSend ? 
PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState)] = gettime() - startTime; + struct proxyOp* parent = event->parent; + switch (eState) { + case ncclProfilerProxyStepSendGPUWait: + event->timestamp[PROXY_STEP_SEND_GPU_WAIT] = gettime() - startTime; + break; + case ncclProfilerProxyStepSendPeerWait_v4: + // do not update step event if in SendPeerWait + if (event->state == ncclProfilerProxyStepSendPeerWait_v4) break; + event->timestamp[PROXY_STEP_SEND_PEER_WAIT] = gettime() - startTime; + event->state = ncclProfilerProxyStepSendPeerWait_v4; + break; + case ncclProfilerProxyStepSendWait: + event->timestamp[PROXY_STEP_SEND_WAIT] = gettime() - startTime; + parent->transSize += eStateArgs->proxyStep.transSize; + break; + case ncclProfilerProxyStepRecvWait: + event->timestamp[PROXY_STEP_RECV_WAIT] = gettime() - startTime; + break; + case ncclProfilerProxyStepRecvFlushWait: + event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT] = gettime() - startTime; + parent->transSize += eStateArgs->proxyStep.transSize; + break; + case ncclProfilerProxyStepRecvGPUWait: + event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime; + break; + } } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* event = (struct proxyCtrl *)eHandle; if (eState == ncclProfilerProxyCtrlAppendEnd) { event->appended = eStateArgs->proxyCtrl.appendedProxyOps; } event->state = eState; + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)eHandle; + if (eState == ncclProfilerKernelChStop) { + event->stopGpuClk = eStateArgs->kernelCh.pTimer; + } } + debugEvent(eHandle, "RecordEventState"); return ncclSuccess; } -ncclProfiler_t ncclProfiler_v3 = { +ncclProfiler_t ncclProfiler_v4 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index 43f719045..a56106e10 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -27,8 +27,8 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", - event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n", + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { @@ -38,8 +38,8 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, 
\"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n", - event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n", + event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { @@ -50,47 +50,43 @@ __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { static __thread int proxyOpId; __hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) { if (event->isSend) { - int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted); - int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait); - int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted); - int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", - "Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ScheduleSend", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "ScheduleSend", proxyOpId, getpid(), 1, event->progrTs); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ProgressSend", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); } else { - int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted); - int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived); - int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted); - int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": 
{\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", - "Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ScheduleRecv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "ScheduleRecv", proxyOpId, getpid(), 1, event->progrTs); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ProgressRecv", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); } } __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs); + event->isSend ? "ProgressSend" : "ProgressRecv", proxyOpId++, getpid(), 1, event->stopTs); } static __thread int proxyStepId; __hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) { if (event->isSend) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_GPU_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]); + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step); + "SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); + "SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); + "SendWait", proxyStepId, getpid(), 
1, event->timestamp[PROXY_STEP_SEND_WAIT], event->step); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_WAIT], event->step); } } @@ -100,13 +96,13 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { "SendWait", proxyStepId++, getpid(), 1, event->stopTs); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step); + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]); + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step); + "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs); } @@ -115,8 +111,8 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { static __thread int kernelId; __hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) { if (event->type != ncclProfileKernelCh) return; - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", - "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"StartGpuClk\": %lu, \"StopGpuClk\": %lu}},\n", + "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId, event->startGpuClk, event->stopGpuClk); } __hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) { @@ 
-134,6 +130,8 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { str = "Sleep"; } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) { str = "Append"; + } else { + return; } if (event->state == ncclProfilerProxyCtrlAppendEnd) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n", @@ -188,9 +186,8 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, "Collective event %p tag = %s {\n", event, tag); fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); fprintf(fh, " parent = %p\n", event->base.parent); - for (int j = 0; j < MAX_OPS; j++) { - for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); - for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + for (int j = 0; j < 2*MAX_OPS; j++) { + for (int i = 0; i < MAX_CHANNELS; i++) if (event->op[i][j].type == ncclProfileProxyOp) fprintf(fh, " op[%d] = %p\n", i, &event->op[i]); } fprintf(fh, " startTs = %f\n", event->base.startTs); fprintf(fh, " stopTs = %f\n", event->base.stopTs); @@ -207,17 +204,18 @@ void debugEvent(void* eHandle, const char* tag) { } else if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag); - fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv"); + fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv"); fprintf(fh, " channel = %d\n", event->channelId); fprintf(fh, " parent = %p\n", event->parent); fprintf(fh, " rank = %d\n", event->rank); fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " progrTs = %f\n", event->progrTs); fprintf(fh, " stopTs = %f\n", event->stopTs); fprintf(fh, "}\n"); } else if (type == ncclProfileProxyStep) { struct proxyStep* event = (struct proxyStep *)eHandle; fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag); - fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv"); + fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? 
"Send" : "Recv"); fprintf(fh, " parent = %p\n", event->parent); fprintf(fh, " startTs = %f\n", event->startTs); fprintf(fh, " stopTs = %f\n", event->stopTs); @@ -260,8 +258,7 @@ void printEvent(FILE* fh, void* handle) { for (int i = 0; i < MAX_CHANNELS; i++) { printKernelChEventHeader(fh, &c->kernel[i]); for (int j = 0; j < c->nProxyOps[i]; j++) { - printEvent(fh, &c->send[i][j]); - printEvent(fh, &c->recv[i][j]); + printEvent(fh, &c->op[i][j]); } printKernelChEventTrailer(fh, &c->kernel[i]); } diff --git a/ext-profiler/example/print_event.h b/ext-profiler/example/print_event.h index 8e2db4c2d..e32560dca 100644 --- a/ext-profiler/example/print_event.h +++ b/ext-profiler/example/print_event.h @@ -7,6 +7,9 @@ #ifndef PRINT_EVENT_H_ #define PRINT_EVENT_H_ +#include "nccl/common.h" +extern ncclDebugLogger_t logFn; + void debugEvent(void* eHandle, const char* tag); void printEvent(FILE* fh, void* handle); diff --git a/makefiles/common.mk b/makefiles/common.mk index 545203a10..8a35a8fab 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -17,6 +17,8 @@ PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 NET_PROFILER ?= 0 +MLX5DV ?= 0 +MAX_EXT_NET_PLUGINS ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -49,8 +51,10 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 - -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) +# Prior to SM75 is deprecated from CUDA13.0 onwards + NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) +else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) @@ -66,14 +70,21 @@ else endif $(info NVCC_GENCODE is ${NVCC_GENCODE}) +# CUDA 13.0 requires c++17 +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) + CXXSTD ?= -std=c++17 +else + CXXSTD ?= -std=c++11 +endif + CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ - -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ - -I $(CUDA_INC) \ + -Wall -Wno-unused-function -Wno-sign-compare $(CXXSTD) -Wvla \ + -I $(CUDA_INC) -I $(CUDA_INC)/cccl \ $(CXXFLAGS) # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 
-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -136,9 +147,17 @@ CXXFLAGS += -DPROFAPI endif ifneq ($(RDMA_CORE), 0) -CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 +CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 -libverbs +endif + +ifneq ($(MLX5DV), 0) +CXXFLAGS += -DNCCL_BUILD_MLX5DV=1 -lmlx5 endif ifneq ($(NET_PROFILER), 0) CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 endif + +ifneq ($(MAX_EXT_NET_PLUGINS), 0) +CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS) +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index 5c0b0de9a..f41e7a783 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 26 -NCCL_PATCH := 6 +NCCL_MINOR := 27 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 65da6300b..eab662ef9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ diff --git a/src/allocator.cc b/src/allocator.cc new file mode 100644 index 000000000..c58181948 --- /dev/null +++ b/src/allocator.cc @@ -0,0 +1,196 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "transport.h" +#include "group.h" + +NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); +ncclResult_t ncclMemAlloc(void **ptr, size_t size) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + +#if CUDART_VERSION >= 12010 + size_t memGran = 0; + CUdevice currentDev; + CUmemAllocationProp memprop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle = (CUmemGenericAllocationHandle)-1; + int cudaDev; + int flag; + int dcnt; + + if (ptr == NULL || size == 0) goto fallback; + + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + + if (ncclCuMemEnable()) { + size_t handleSize = size; + int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + // Query device to see if FABRIC handle support is available + flag = 0; + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; + memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; + memprop.location.id = currentDev; + // Query device to see if RDMA support is available + flag = 0; + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); + if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + CUDACHECK(cudaGetDeviceCount(&dcnt)); + ALIGN_SIZE(handleSize, memGran); + + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { + /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); + if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { + requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } else if (err != CUDA_SUCCESS) { + // Catch and report any error from above + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } + } else { + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + for (int i = 0; i < dcnt; ++i) { + int p2p = 0; + if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, i, cudaDev) == cudaSuccess) && p2p)) { + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = i; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); + } + if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); + } + goto exit; + } + +fallback: +#endif + // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. 
That's deliberate though: + // we want CUDA to return an error to the caller. + // coverity[var_deref_model] + CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); + +exit: + return ret; +fail: + goto exit; +} + +NCCL_API(ncclResult_t, ncclMemFree, void *ptr); +ncclResult_t ncclMemFree(void *ptr) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + int saveDevice; + + CUDACHECK(cudaGetDevice(&saveDevice)); +#if CUDART_VERSION >= 12010 + CUdevice ptrDev = 0; + + if (ptr == NULL) goto fallback; + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); + CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); + goto exit; + } + +fallback: +#endif + CUDACHECKGOTO(cudaFree(ptr), ret, fail); + +exit: + CUDACHECK(cudaSetDevice(saveDevice)); + return ret; +fail: + goto exit; +} + +// This is a collective function and should be called by all ranks in the communicator +ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) { + ncclResult_t ret = ncclSuccess; + void* regSymAddr = NULL; + size_t allocSize = size; + size_t granularity; + CUdevice cuDev; + CUmemAllocationProp memprop = {}; + CUmemGenericAllocationHandle memHandle; + int bit = 0, cnt = 0; + + // aligment must be power of 2 as an input + while (bit < sizeof(size_t) * 8) { + if (alignment & (1L << bit)) cnt++; + if (cnt == 2) { + WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment); + goto fail; + } + bit++; + } + // temporarily align the alignment to NCCL_REC_PAGE_SIZE + ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE); + + CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail); + memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memprop.requestedHandleTypes = ncclCuMemHandleType; + memprop.location.id = cuDev; + CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(allocSize, granularity); + + CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail); + ALIGN_SIZE(comm->symAllocHead, alignment); + NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + comm->symAllocHead += allocSize; + *symPtr = regSymAddr; + +exit: + return ret; +fail: + *symPtr = NULL; + goto exit; +} + +ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) { + CUmemGenericAllocationHandle handle; + size_t size = 0; + ncclResult_t ret = ncclSuccess; + int saveDev = comm->cudaDev; + CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); + if (ncclCuMemEnable()) { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail); + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail); + NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail); + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + } +exit: + 
CUDACHECK(cudaSetDevice(saveDev)); + return ret; +fail: + goto exit; +} diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 9e24faadf..f05337249 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -94,6 +94,7 @@ ncclResult_t bootstrapNetInit() { pthread_mutex_lock(&bootstrapNetLock); if (bootstrapNetInitDone == 0) { const char* env = ncclGetEnv("NCCL_COMM_ID"); + int nIfs = 0; if (env) { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { @@ -101,13 +102,15 @@ ncclResult_t bootstrapNetInit() { pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } - if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, + &nIfs)); + if (nIfs <= 0) { WARN("NET/Socket : No usable listening interface found"); pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { - int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs)); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); pthread_mutex_unlock(&bootstrapNetLock); @@ -828,7 +831,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); - if (parent->config.splitShare) { + if (parent->shareResources) { /* map local rank to top parent local rank. */ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; diff --git a/src/channel.cc b/src/channel.cc index bc48986d8..c2b88414b 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -147,7 +147,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) { int nPeers = nRanks + collnetNRanks + nvlsNRanks; /* channel peers are only valid when async init thread completes commAlloc() and - * the channel is intialized with initChannel(); if either is not done, this channel + * the channel is initialized with initChannel(); if either is not done, this channel * should never be free. 
*/ if (channel->id == -1 || channel->peers == NULL) return ncclSuccess; diff --git a/src/debug.cc b/src/debug.cc index e2cc4f810..f034bc7e0 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -16,6 +16,8 @@ #include #include "param.h" +#define NCCL_DEBUG_RESET_TRIGGERED (-2) + int ncclDebugLevel = -1; static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on static char ncclDebugTimestampFormat[256]; // with space for subseconds @@ -26,7 +28,7 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV +static uint64_t ncclDebugMask = 0; FILE *ncclDebugFile = stdout; static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; static std::chrono::steady_clock::time_point ncclEpoch; @@ -34,11 +36,16 @@ static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; +// This function must be called with ncclDebugLock locked! static void ncclDebugInit() { - pthread_mutex_lock(&ncclDebugLock); - if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); int tempNcclDebugLevel = -1; + uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask + if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) { + // Finish the reset initiated via ncclResetDebugInit(). + fclose(ncclDebugFile); + ncclDebugFile = stdout; + } if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { @@ -61,7 +68,7 @@ static void ncclDebugInit() { if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } - ncclDebugMask = invert ? ~0ULL : 0ULL; + tempNcclDebugMask = invert ? ~0ULL : 0ULL; char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); char *subsys = strtok(ncclDebugSubsys, ","); while (subsys != NULL) { @@ -102,7 +109,7 @@ static void ncclDebugInit() { mask = NCCL_ALL; } if (mask) { - if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; + if (invert) tempNcclDebugMask &= ~mask; else tempNcclDebugMask |= mask; } subsys = strtok(NULL, ","); } @@ -246,15 +253,15 @@ static void ncclDebugInit() { if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); if (file != nullptr) { - setbuf(file, nullptr); // disable buffering + setlinebuf(file); // disable block buffering ncclDebugFile = file; } } } ncclEpoch = std::chrono::steady_clock::now(); + ncclDebugMask = tempNcclDebugMask; __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); - pthread_mutex_unlock(&ncclDebugLock); } /* Common logging function used by the INFO, WARN and TRACE macros @@ -262,19 +269,38 @@ static void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { - if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); + bool locked = false; // Keeps track of the ncclDebugLock state. 
+ int gotLevel = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); + if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { pthread_mutex_lock(&ncclDebugLock); + locked = true; va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); va_end(vargs); + } + + if (gotLevel >= 0 && (gotLevel < level || (flags & ncclDebugMask) == 0)) { + if (locked) + pthread_mutex_unlock(&ncclDebugLock); + return; + } + + if (!locked) { + pthread_mutex_lock(&ncclDebugLock); + locked = true; + } + // From this point on ncclDebugLock is always locked so we don't need to check "locked" anymore. + if (ncclDebugLevel < 0) + ncclDebugInit(); + if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) { pthread_mutex_unlock(&ncclDebugLock); + return; } - if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); @@ -335,7 +361,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // Add level specific formatting. if (level == NCCL_LOG_WARN) { len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %s:%d NCCL WARN ", cudaDev, filefunc, line); - if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO; + if (ncclWarnSetDebugInfo) __atomic_store_n(&ncclDebugLevel, NCCL_LOG_INFO, __ATOMIC_RELEASE); } else if (level == NCCL_LOG_INFO) { len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] NCCL INFO ", cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { @@ -360,19 +386,17 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // necessary since we write bytes instead of the string. buffer[len++] = '\n'; fwrite(buffer, 1, len, ncclDebugFile); + pthread_mutex_unlock(&ncclDebugLock); } NCCL_API(void, ncclResetDebugInit); void ncclResetDebugInit() { // Cleans up from a previous ncclDebugInit() and reruns. // Use this after changing NCCL_DEBUG and related parameters in the environment. - __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); - if (ncclDebugFile != stdout) { - fclose(ncclDebugFile); - ncclDebugFile = stdout; - } - ncclDebugLevel = -1; - ncclDebugInit(); + pthread_mutex_lock(&ncclDebugLock); + // Let ncclDebugInit() know to complete the reset. + __atomic_store_n(&ncclDebugLevel, NCCL_DEBUG_RESET_TRIGGERED, __ATOMIC_RELEASE); + pthread_mutex_unlock(&ncclDebugLock); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/device/Makefile b/src/device/Makefile index 3562563fc..df58489a0 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -23,6 +23,9 @@ INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) +NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all +NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" + SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 @@ -30,7 +33,22 @@ COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ mkdir -p $(dir $1);\ - $(call COMPILE$(suffix $2),$1,$2) + $(call COMPILE$(or $3,$(suffix $2)),$1,$2) +endef + +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ + -gencode=arch=compute_120a,code=sm_120a +else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a +else + NVCC_GENCODE_LDMC_FP8 = +endif + +define COMPILE_SYM +@$(SAY) "Compiling" $2;\ + mkdir -p $(dir $1);\ + $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1 endef DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 @@ -48,8 +66,6 @@ endef all: $(MANIFEST) -ifeq (1,1) -# Case if the directory is generated on-demand: $(OBJDIR)/gensrc: generate.py @mkdir -p $@ (which python3 >/dev/null || \ @@ -57,22 +73,26 @@ $(OBJDIR)/gensrc: generate.py printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py $@ "$(ONLY_FUNCS)" -else -# Case if the directory is pre-generated and checked in the repo as ./gen: -$(OBJDIR)/gensrc: - @mkdir -p $(OBJDIR); ln -srfn ./gen $@ -endif + +$(OBJDIR)/gensrc/symmetric: $(OBJDIR)/gensrc symmetric/generate.py + @mkdir -p $@ + ./symmetric/generate.py $@ # The trailing ";" is necessary to make this an "empty recipe": # https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html $(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ; +$(OBJDIR)/gensrc/symmetric/rules.mk: $(OBJDIR)/gensrc/symmetric ; + -include $(OBJDIR)/gensrc/rules.mk # "gensrc/rules.mk" populates $(LIB_OBJS_GEN) +-include $(OBJDIR)/gensrc/symmetric/rules.mk +# "gensrc/symmetric/rules.mk" populates $(LIB_OBJS_SYM_GEN) + SRCS = common.cu onerank.cu -LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) +LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) $(LIB_OBJS_SYM_GEN) $(OBJDIR)/%.o: % $(OBJDIR)/%.d $(call COMPILE,$@,$<) @@ -80,12 +100,18 @@ $(OBJDIR)/%.o: % $(OBJDIR)/%.d $(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d $(call COMPILE,$@,$(OBJDIR)/gensrc/$*) +$(OBJDIR)/genobj/symmetric/%.o: $(OBJDIR)/gensrc/symmetric $(OBJDIR)/genobj/symmetric/%.d + $(call COMPILE,$@,$(OBJDIR)/gensrc/symmetric/$*) + $(OBJDIR)/%.d: % $(call DEPENDS,$@,$<) $(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/% $(call DEPENDS,$@,$<) +$(OBJDIR)/genobj/symmetric/%.d: $(OBJDIR)/gensrc/symmetric/% + $(call DEPENDS,$@,$<) + $(DEVGLUE_OBJ): $(LIB_OBJS) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ @@ -94,6 +120,7 @@ $(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ) -include $(wildcard $(OBJDIR)/*.d) -include $(wildcard $(OBJDIR)/genobj/*.d) +-include $(wildcard $(OBJDIR)/genobj/symmetric/*.d) .PHONY: clean clean: diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 854ebbf3a..db967861e 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ 
-173,73 +173,221 @@ struct RunWorkColl struct RunWorkColl { + template + struct Scatterer { + struct ncclDevWorkColl* work; + ssize_t chunkSize; + ssize_t railGridOffset; + + template + __device__ __forceinline__ void operator()( + int tid, int tn, int slice, int maxSliceSize, + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag + ) { + static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); + static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); + + struct ncclNvls* nvls = &ncclShmem.channel.nvls; + int nNodes = ncclShmem.comm.nNodes; + int nRails = nvls->nHeads; + int part = ncclShmem.channelId - work->channelLo; + char* inbuf = (char*)work->sendbuff; + char* outbuf = (char*)work->recvbuff; + ssize_t countPerRank = work->collnet.count; + bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank * countPerRank); + ssize_t railAllBeg = min(railGridOffset + part * chunkSize, nNodes * countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + int railAllSize = railAllEnd - railAllBeg; + int rail = 0; + int src = 0; + + if (BcastSendNotRecv) { + rail = nvls->headRank; + } else { + if (work->regUsed) return; + rail = 0; + } + if (tid < nDsts) dstSizes[tid] = railAllSize; + do { + int node = railAllBeg / countPerRank; + int railAllOffset = 0; + while (railAllOffset < railAllSize) { + ssize_t railOneBeg = node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg; + int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset); + int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail]; + ssize_t userOneBeg = rank * countPerRank + railOneOffset; + int outIsDst = (inPlace && rank == ncclShmem.comm.rank) || BcastSendNotRecv || work->regUsed ? 0 : 1; + if (nSrcs != 0 && outIsDst + nDsts != 0) { + reduceCopy + (tid, tn, 0, nullptr, false, + /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { + return (char*)srcPtrs[src] + railAllOffset; + }, + /*nDsts=*/outIsDst + nDsts, [=]__device__(int d) -> void* { + return d < outIsDst ? outbuf + userOneBeg + : work->regUsed ? (char*)dstPtrs[d - outIsDst] + userOneBeg + : (char*)dstPtrs[d - outIsDst] + railAllOffset; + }, delta); + } + railAllOffset += delta; + node += 1; + } + rail += 1; + src += 1; + } while (!BcastSendNotRecv && src < nRails); + } + }; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const ssize_t rank = ncclShmem.comm.rank; - size_t count, gridOffset, channelCount; - size_t chunkCount; - ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); - size_t offset; int nelem; - const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; - const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; - const int tidEndGather = nThreadsGather; - const int tidEndBcast = tidEndGather + nThreadsBcast; + const int nThreadsNetSend = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE); + const int nThreadsGather = work->regUsed ? 
roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE; + const int nThreadsBcast = NCCL_MAX_NTHREADS - nThreadsNetSend - nThreadsGather; - if (!work->regUsed) { - if (tid < tidEndGather) { - // Gather - using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0); + const int tidEndGather = nThreadsGather; + const int tidEndNetSend = tidEndGather + nThreadsNetSend; + const int tidEndBcast = tidEndNetSend + nThreadsBcast; + + if (work->oneNode) { + const ssize_t rank = ncclShmem.comm.rank; + size_t count, gridOffset, channelCount, offset, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); + if (!work->regUsed) { + if (tid < tidEndGather) { + // Gather + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + } else if (tid < tidEndBcast) { + // Bcast through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.send(offset, nelem); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - } else if (tid < tidEndBcast) { - // Bcast through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.send(offset, nelem); + } else { + if (tid < tidEndGather) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + + /* used as sync */ + prims.scatter(0, 0, 0, 0, -1, 0); + + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + prims.gather(0, 0, 0, 0, -1, 0); + } + } else if (tid < tidEndBcast) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); + /* used as sync */ + prims.recv(0, 0); + + for (size_t 
elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + ssize_t inpOffset = gridOffset + elemOffset; + ssize_t outOffset = inpOffset + rank * count; + nelem = min(chunkCount, channelCount - elemOffset); + prims.directSend(inpOffset, outOffset, nelem); + } } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { - /* direct allgather */ + // NVLS + IB SHARP + int nNodes = ncclShmem.comm.nNodes; + int part = ncclShmem.channelId - work->channelLo; + ssize_t countPerRank = work->collnet.count; + const int nChannels = work->channelHi - work->channelLo + 1; + ssize_t chunkCount = work->collnet.chunkCount; if (tid < tidEndGather) { using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - - /* used as sync */ - prims.scatter(0, 0, 0, 0, -1, 0); - - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - prims.gather(0, 0, 0, 0, -1, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid, nThreadsGather, nvls->up, nullptr, nullptr, work->recvbuff, + /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 1, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); } - } else if (tid < tidEndBcast) { - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, - work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); - /* used as sync */ - prims.recv(0, 0); - - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - ssize_t inpOffset = gridOffset + elemOffset; - ssize_t outOffset = inpOffset + rank * count; - nelem = min(chunkCount, channelCount - elemOffset); - prims.directSend(inpOffset, outOffset, nelem); + } else { + if (work->netRegUsed) { + using ProtoSend = ProtoSimple<1, 1, COLL_UNROLL>; + using ProtoBcast = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + int maxSteps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount); + int curSteps = -1; + int postThread = tid - tidEndGather == 0 ? 1 : 0; + // for UB, we need to control the send speed to avoid net congestion. + // first unroll 2 steps, then unroll the rest steps when the data is received. 
+ if (postThread) { + curSteps = min(2, maxSteps); + Primitives, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, curSteps); + } + Primitives, /*Direct=*/1, ProtoBcast, 0> + prims(tid - tidEndGather, nThreadsNetSend + nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr, + /*redOpArg=*/0, 2 * ProtoBcast::MaxGroupWidth, 0, 0, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + if (postThread && curSteps < maxSteps) { + curSteps++; + Primitives, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, 1); + } + } + } else { + if (tid < tidEndNetSend) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndGather, nThreadsNetSend, nullptr, &nvls->out, work->sendbuff, nullptr, + /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + ssize_t railAllBeg = railGridOffset + part * chunkCount; + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t beg = max(railAllBeg, railOneBeg); + ssize_t end = min(railAllEnd, railOneEnd); + prims.send(beg - railOneBeg, max(ssize_t(0), end - beg)); + } + } else { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndNetSend, nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr, + /*redOpArg=*/0, 2 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } + } } } } @@ -254,7 +402,7 @@ struct RunWorkColl + template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 81da55401..f6b6e9c0e 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -106,7 +106,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directSend(offset, nelem); + prims.directSend(offset, offset, nelem); } } else { diff --git a/src/device/common.h b/src/device/common.h index 855db730f..a2884b50c 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -52,7 +52,6 @@ struct ncclShmemData { uint16_t funcId; int nWorks; int workSize; - uint32_t workConsumed; uint64_t workCounter; bool profilerEnabled; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; @@ -182,7 +181,6 @@ __device__ __forceinline__ void loadWorkBatchToShmem( } if (tid == 0) { ncclShmem.workSize = workSize; - ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize; } // We deliberately replicate these div and mod calculations into the case // blocks above so that they get constant divisor optimizations by the compiler. 
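For illustration only (not part of this patch): the constant-divisor comment above relies on the compiler strength-reducing a division by a compile-time constant into a multiply/shift sequence, which it cannot do when the divisor is only known at runtime. A hypothetical sketch:

    // Hypothetical illustration of the constant-divisor optimization.
    __device__ __forceinline__ int divByConst(int x) { return x / 12; }        // becomes mul+shift
    __device__ __forceinline__ int divByRuntime(int x, int d) { return x / d; } // real divide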
@@ -242,6 +240,12 @@ __device__ __forceinline__ void loadWorkBatchToShmem( } } +__device__ __forceinline__ unsigned long long int globaltimer() { + unsigned long long int timer; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timer)); + return timer; +} + template struct RunWorkColl { __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) { @@ -296,40 +300,30 @@ struct RunWorkBatch { #define STOP 1 #define FINI 2 -__device__ __forceinline__ bool profilerEnabled(void) { - // Check if any of the workItems in the batch is profiled. If so, there is an equivalent - // profiler ProxyOp waiting for the counter update in the host thread. If this check was - // done only for the first workItem the profiler counter for other workItems in the batch - // could never be updated, leaving the host thread spinning forever for the counter update - // and causing a hang. - bool enabled = false; - for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) { - if (ncclShmem.workType == ncclDevWorkTypeP2p) - enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled; - else - enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled; - } - return enabled; +__device__ __forceinline__ bool profilerEnabled(int workItemIdx) { + return (ncclShmem.workType == ncclDevWorkTypeP2p) ? + ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[workItemIdx].profilerEnabled : + ((struct ncclDevWorkColl*)ncclShmem.workStorage)[workItemIdx].profilerEnabled; } __device__ __forceinline__ void profiler(int action) { - if (action == START) { - if (threadIdx.x == 0) { - // increment workCounter regardless of the profiler being active or not + if (threadIdx.x == 0) { + int idx = 0; + uint64_t wc = ncclShmem.channel.workCounter + 1; + if (action == START) { + for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) { + if (!profilerEnabled(idx++)) continue; + ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer(); + ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; + } + } else { + for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) { + if (!profilerEnabled(idx++)) continue; + ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer(); + ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; + } ncclShmem.channel.workCounter += ncclShmem.nWorks; - if(!profilerEnabled()) return; - ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - } - } else if (action == STOP) { - if (threadIdx.x == 0 && profilerEnabled()) { - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - } - } else { // FINI - if (threadIdx.x == 0) { - // store the workCounter back to vidmem regardless of the profiler being active or not - ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; - if (!profilerEnabled()) return; - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } } @@ -388,11 +382,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } __syncthreads(); // publish ncclShmem - if (tid == 0 && ncclShmem.args.workStorageType == 
ncclDevWorkStorageTypeFifo) { - // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() - ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; - } - while (ncclShmem.aborted == 0) { profiler(START); if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { @@ -407,11 +396,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a profiler(STOP); loadWorkBatchToShmem(tid, tn, args, batchIx); __syncthreads(); - - if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { - // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() - ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; - } } profiler(FINI); } diff --git a/src/device/generate.py b/src/device/generate.py index b69a2d7cc..f9c3a0e79 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -327,7 +327,7 @@ def partition_by_name(fns): out = f.write impl_names = sorted(name_to_funcs.keys()) names = impl_names + ["host_table.cc", "device_table.cu"] - out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n" + out("LIB_OBJS_GEN = $(patsubst %,$(OBJDIR)/genobj/%.o,{names})\n" .format(names=" ".join(names))) out("\n") diff --git a/src/device/op128.h b/src/device/op128.h index b2e519d8c..e7da4812c 100644 --- a/src/device/op128.h +++ b/src/device/op128.h @@ -99,37 +99,60 @@ template<> union BytePack<0> {}; template<> union BytePack<1> { - uint8_t u8, native; + uint8_t u8[1], native; }; template<> union BytePack<2> { BytePack<1> half[2]; + BytePack<1> b1[2]; uint8_t u8[2]; - uint16_t u16, native; + uint16_t u16[1], native; }; template<> union BytePack<4> { BytePack<2> half[2]; + BytePack<1> b1[4]; + BytePack<2> b2[2]; uint8_t u8[4]; uint16_t u16[2]; - uint32_t u32, native; + uint32_t u32[1], native; }; template<> union BytePack<8> { BytePack<4> half[2]; + BytePack<1> b1[8]; + BytePack<2> b2[4]; + BytePack<4> b4[2]; uint8_t u8[8]; uint16_t u16[4]; uint32_t u32[2]; - uint64_t u64, native; + uint64_t u64[1], native; }; template<> union alignas(16) BytePack<16> { BytePack<8> half[2]; + BytePack<1> b1[16]; + BytePack<2> b2[8]; + BytePack<4> b4[4]; + BytePack<8> b8[2]; uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; - ulong2 ul2, native; + ulong2 ul2[1], native; +}; +template +union BytePack { + BytePack half[2]; + BytePack<1> b1[Size]; + BytePack<2> b2[Size/2]; + BytePack<4> b4[Size/4]; + BytePack<8> b8[Size/8]; + BytePack<16> b16[Size/16]; + uint8_t u8[Size]; + uint16_t u16[Size/2]; + uint32_t u32[Size/4]; + uint64_t u64[Size/8]; }; template @@ -357,19 +380,19 @@ __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0 } template<> __device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) { - asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); + asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) { - asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); + asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) { - asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory"); + asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.native) 
: "memory"); } template<> __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) { - asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory"); + asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) { @@ -384,6 +407,56 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack +__device__ __forceinline__ Pack loadPack(T* ptr, int ix, int end) { + constexpr int Size = sizeof(Pack); + ptr += ix; + int n = end - ix; + if (alignof(T) == Size && sizeof(T) == Size) { + return *(Pack*)ptr; + } else if ((Size+3)/4 + 1 < Size/sizeof(T)) { + union { Pack ans; uint32_t part[Size/4]; }; + int misalign = reinterpret_cast(ptr) % 4; + uint32_t* down = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); + int i; + #pragma unroll + for (i=0; i < Size/4; i++) { + if (i*4/sizeof(T) < 1 || i*4/sizeof(T) < n) part[i] = down[i]; + } + uint32_t extra; + if (misalign) extra = down[i]; + #pragma unroll + for (i=0; i < Size/4; i++) { + part[i] = __funnelshift_r(part[i], part[i+1], 8*misalign); + } + if (misalign) part[i] = __funnelshift_r(part[i], extra, 8*misalign); + return ans; + } else { + union { Pack ans; BytePack part[Size/sizeof(T)]; }; + #pragma unroll + for (int i=0; i < Size/sizeof(T); i++) { + if (i < 1 || i < n) part[i] = ((BytePack*)ptr)[i]; + } + return ans; + } +} + +// Store pack starting at index in array. Ignore elements past end (length of array). +template +__device__ __forceinline__ void storePack(T* ptr, int ix, int end, Pack val) { + constexpr int Size = sizeof(Pack); + union { Pack tmp; BytePack part[Size/sizeof(T)]; }; + tmp = val; + ptr += ix; + int n = end - ix; + #pragma unroll + for (int i=0; i < Size/sizeof(T); i++) { + if (i < 1 || i < n) ((BytePack*)ptr)[i] = part[i]; + } +} + + // Warp-uniform memory copy from shared address (not generic) to global memory. // The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value // is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes. @@ -426,10 +499,10 @@ __device__ __forceinline__ void copyGlobalShared_WarpUnrolled( b4[3] = ld_shared<4>(srcAddr + 3*4); if (srcMisalign != 0) { BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4); - b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8); - b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8); - b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8); - b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8); + b4[0].native = __funnelshift_r(b4[0].native, b4[1].native, srcMisalign*8); + b4[1].native = __funnelshift_r(b4[1].native, b4[2].native, srcMisalign*8); + b4[2].native = __funnelshift_r(b4[2].native, b4[3].native, srcMisalign*8); + b4[3].native = __funnelshift_r(b4[3].native, b4_4.native, srcMisalign*8); } if (Multimem) multimem_st_global<16>(dstAddr, b16); else st_global<16>(dstAddr, b16); diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index cf3ba9b55..2ad965bf7 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -125,7 +125,7 @@ class Primitives< void **ptrs = isSendNotRecv ? 
(ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); - if (flags & NetRegMode) { + if ((flags & NetRegMode) && ((!isSendNotRecv && DirectRecv) || (isSendNotRecv && DirectSend))) { if (P2p) { ptrs[index] = NULL; } else { @@ -337,7 +337,7 @@ class Primitives< } template - __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) { + __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag = 0, uint32_t recvDirectFlag = 0) { #pragma unroll 1 for (int slice=0; slice < SlicePerChunk; slice++) { if (tid < nworkers) { @@ -361,7 +361,7 @@ class Primitives< } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else { if (flags & DirectRead) { @@ -372,11 +372,11 @@ class Primitives< else ptrs[index] = nullptr; } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } subBarrier(); @@ -391,7 +391,7 @@ class Primitives< } else { nsend = fan.nsend(); } - fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend > + fn.template operator() (tid, nworkers, slice, stepSize * StepPerSlice, nrecv, ncclShmem.groups[group].srcs, nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag); @@ -896,6 +896,12 @@ class Primitives< __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp); } + __device__ __forceinline__ void recvDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, -1, -1>(-1, outIx, eltN, postOp); + } + __device__ __forceinline__ void directRecvSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 1, -1, -1>(outIx, outIx, eltN, postOp); + } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index c2378e3df..0d054bb2d 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -38,18 +38,18 @@ struct IsFloatingPoint: std::true_type {}; // 3. Have constructor taking `uint64_t opArg`. 
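For illustration only (not part of this patch): combining requirement 3 above with the FuncSum/FuncProd definitions just below, a reduction functor has roughly this shape (FuncExample is hypothetical):

    #include <cstdint>
    // Hypothetical functor following the same pattern as FuncSum below:
    // an EltType alias plus a constructor accepting the 64-bit op argument.
    template<typename T>
    struct FuncExample {
      using EltType = T;
      __device__ __forceinline__ FuncExample(uint64_t opArg = 0) {}
    };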
template -struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; }; +struct FuncCopy { using EltType = T; __device__ __forceinline__ FuncCopy(uint64_t opArg=0) {}; }; template -struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; +struct FuncSum { using EltType = T; __device__ __forceinline__ FuncSum(uint64_t opArg=0) {}; }; template -struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; +struct FuncProd { using EltType = T; __device__ __forceinline__ FuncProd(uint64_t opArg=0) {}; }; template struct FuncMinMax { using EltType = T; BytePack xormask; // only used by integers bool isMinNotMax; // only used by floats - __device__ FuncMinMax(uint64_t opArg=0) { + __device__ __forceinline__ FuncMinMax(uint64_t opArg=0) { xormask.native = opArg; isMinNotMax = (opArg&1)==0; } @@ -64,13 +64,13 @@ template struct FuncSumPostDiv; template struct RedOpArg { // default case: no argument static constexpr bool ArgUsed = false; - __device__ static uint64_t loadArg(void *ptr) { return 0; } + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { return 0; } }; template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; @@ -84,6 +84,11 @@ struct RedOpArg> { // of elements. These classes are intended to be specialized for specific // combinations of reduction function and pack size. +template +struct Apply_Cast/*{ + static BytePack cast(BytePack a); +}*/; + template struct Apply_Reduce /*{ static BytePack reduce( @@ -111,16 +116,60 @@ struct Apply_LoadMultimem/*{ static BytePack load(Fn fn, uintptr_t addr); }*/; + +// Helpers for dealing with BytePack<0>'s +template +struct Apply_Cast_MaybeEmpty: Apply_Cast {}; +template +struct Apply_Cast_MaybeEmpty { + __device__ constexpr static BytePack<0> cast(BytePack<0> a) { return {}; } +}; + +template +struct Apply_Reduce_MaybeEmpty: Apply_Reduce {}; +template +struct Apply_Reduce_MaybeEmpty { + __device__ constexpr static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } +}; + +template +struct Apply_PreOp_MaybeEmpty: Apply_PreOp {}; +template +struct Apply_PreOp_MaybeEmpty { + static constexpr bool IsIdentity = true; + __device__ constexpr static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } +}; + +template +struct Apply_PostOp_MaybeEmpty: Apply_PostOp {}; +template +struct Apply_PostOp_MaybeEmpty { + static constexpr bool IsIdentity = true; + __device__ constexpr static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } +}; + +template +struct Apply_LoadMultimem_MaybeEmpty: Apply_LoadMultimem {}; +template +struct Apply_LoadMultimem_MaybeEmpty { + __device__ constexpr static BytePack<0> load(Fn fn, uintptr_t addr) { return {}; } +}; + //////////////////////////////////////////////////////////////////////////////// // Public API for calling the trait classes. These take the data elements as a // pack of any type, which could be a BytePack or any integral type (uint64_t, // uint32_t, etc.), and will return a new pack where each element has been // transformed appropriately. 
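For illustration only (not part of this patch): a host-side analogue, in plain C++ with hypothetical names, of what applyReduce with a FuncSum over an 8-byte pack of two floats amounts to:

    #include <cstdint>
    #include <cstring>
    // Treat an 8-byte pack as two floats and reduce elementwise -- the same
    // effect applyReduce(FuncSum<float>(), a, b) has on a BytePack<8> on device.
    static uint64_t sumPackOfTwoFloats(uint64_t a, uint64_t b) {
      float fa[2], fb[2];
      std::memcpy(fa, &a, sizeof(fa));
      std::memcpy(fb, &b, sizeof(fb));
      fa[0] += fb[0];
      fa[1] += fb[1];
      uint64_t out;
      std::memcpy(&out, fa, sizeof(out));
      return out;
    }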
+template +__device__ __forceinline__ BytePack::Size*sizeof(B)/sizeof(A)> applyCast(PackA a) { + return Apply_Cast_MaybeEmpty::Size/sizeof(A)>::cast(toPack(a)); +} + template __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { return fromPack( - Apply_Reduce::Size/sizeof(typename Fn::EltType)> + Apply_Reduce_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::reduce(fn, toPack(a), toPack(b)) ); } @@ -128,7 +177,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { template __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { return fromPack( - Apply_PreOp::Size/sizeof(typename Fn::EltType)> + Apply_PreOp_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::preOp(fn, toPack(a)) ); } @@ -136,23 +185,107 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { template __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) { return fromPack( - Apply_PostOp::Size/sizeof(typename Fn::EltType)> + Apply_PostOp_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::postOp(fn, toPack(a)) ); } template __device__ __forceinline__ BytePack applyLoadMultimem(Fn fn, uintptr_t addr) { - return Apply_LoadMultimem::load(fn, addr); + return Apply_LoadMultimem_MaybeEmpty::load(fn, addr); } +//////////////////////////////////////////////////////////////////////////////// +// Apply_Cast + +template +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + BytePack b; + b.half[0] = Apply_Cast::cast(a.half[0]); + b.half[1] = Apply_Cast::cast(a.half[1]); + return b; + } +}; + +template +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(B(fromPack(a))); + } +}; + +template<> +struct Apply_Cast<__half, float, /*EltPerPack=*/1> { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(__half2float(fromPack<__half>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(__float2half_rn(fromPack(a))); + } +}; + +template<> +struct Apply_Cast<__half, float, /*EltPerPack=*/2> { + __device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) { + return toPack(__half22float2(fromPack<__half2>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) { + return toPack(__float22half2_rn(fromPack(a))); + } +}; + +#if defined(__CUDA_BF16_TYPES_EXIST__) && (CUDART_RUNTIME >= 12000 || __CUDA_ARCH__ >= 800) +template<> +struct Apply_Cast<__nv_bfloat16, float, /*EltPerPack=*/2> { + __device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) { + return toPack(__bfloat1622float2(fromPack<__nv_bfloat162>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) { + return toPack(__float22bfloat162_rn(fromPack(a))); + } +}; +#endif + +#define EASY_CAST(A, B, EltPerPack, VecA, VecB) \ + template<> \ + struct Apply_Cast { \ + __device__ __forceinline__ static BytePack cast(BytePack a) { \ + return toPack(VecB(fromPack(a))); \ + } \ + }; \ + template<> \ + struct Apply_Cast { \ + __device__ __forceinline__ static BytePack cast(BytePack b) { \ + return toPack(VecA(fromPack(b))); \ + } \ + }; + +#if defined(__CUDA_FP8_TYPES_EXIST__) +EASY_CAST(__nv_fp8_e5m2, float, 2, __nv_fp8x2_e5m2, float2) +EASY_CAST(__nv_fp8_e5m2, float, 4, __nv_fp8x4_e5m2, float4) + +EASY_CAST(__nv_fp8_e4m3, float, 2, __nv_fp8x2_e4m3, float2) +EASY_CAST(__nv_fp8_e4m3, float, 4, __nv_fp8x4_e4m3, float4) +#endif +#undef 
EASY_CAST + //////////////////////////////////////////////////////////////////////////////// // Apply_Reduce // Nonsensical base case template struct Apply_Reduce { - __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { + __device__ __forceinline__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } }; @@ -164,7 +297,7 @@ struct Apply_Reduce { template struct Apply_Reduce { template - __device__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { a.half[0] = Apply_Reduce::reduce(fn, a.half[0], b.half[0]); a.half[1] = Apply_Reduce::reduce(fn, a.half[1], b.half[1]); return a; @@ -174,25 +307,25 @@ struct Apply_Reduce { // Base case definitions (EltPerPack == 1) template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { return a; } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { return toPack(fromPack(a) + fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { return toPack(fromPack(a) * fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? 
a : b; } }; @@ -200,7 +333,7 @@ struct Apply_Reduce, /*EltPerPack=*/1> { // Optimizations for specfic types and element count combinations: template<> struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { + __device__ __forceinline__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { constexpr uint32_t even = 0x00ff00ffu; uint32_t x = (a.native & even) + (b.native & even); uint32_t y = (a.native & ~even) + (b.native & ~even); @@ -236,7 +369,7 @@ struct Apply_Reduce, /*EltPerPack=*/4> { template<> struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { + __device__ __forceinline__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { uint32_t a = apack.native; uint32_t b = bpack.native; uint32_t ab0 = (a*b) & 0xffu; @@ -332,7 +465,7 @@ template struct Apply_PreOp { static constexpr bool IsIdentity = Apply_PreOp::IsIdentity; template - __device__ static BytePack preOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else @@ -352,7 +485,7 @@ template struct Apply_PreOp { static constexpr bool IsIdentity = true; template - __device__ static BytePack preOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(Fn fn, BytePack a) { return a; } }; @@ -360,7 +493,7 @@ struct Apply_PreOp { template struct Apply_PreOp { static constexpr bool IsIdentity = true; - __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { + __device__ __forceinline__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } }; @@ -373,7 +506,7 @@ template struct Apply_PostOp { static constexpr bool IsIdentity = Apply_PostOp::IsIdentity; template - __device__ static BytePack postOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else @@ -393,7 +526,7 @@ template struct Apply_PostOp { static constexpr bool IsIdentity = true; template - __device__ static BytePack postOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(Fn fn, BytePack a) { return a; } }; @@ -401,7 +534,7 @@ struct Apply_PostOp { template struct Apply_PostOp { static constexpr bool IsIdentity = true; - __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { + __device__ __forceinline__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } }; @@ -413,7 +546,7 @@ struct Apply_PostOp { template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; @@ -426,7 +559,7 @@ template struct FuncPreMulSum { using EltType = T; T scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; T val; }; u64 = opArg; scalar = val; @@ -441,7 +574,7 @@ struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 __half2 scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __half val; }; u64 = opArg; scalar.x = val; @@ -449,7 +582,7 @@ struct FuncPreMulSum { } #else float scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ 
FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __half val; }; u64 = opArg; scalar = (float)val; @@ -466,7 +599,7 @@ struct FuncPreMulSum { using EltType = __nv_bfloat16; #if __CUDA_ARCH__ >= 800 __nv_bfloat162 scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar.x = val; @@ -474,7 +607,7 @@ struct FuncPreMulSum { } #else float scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar = __bfloat162float(val); @@ -489,7 +622,7 @@ struct FuncPreMulSum { struct FuncPreMulSum<__nv_fp8_e4m3> { using EltType = __nv_fp8_e4m3; __half2 scalar2; - __device__ FuncPreMulSum(uint64_t opArg) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg) { union { uint64_t u64; __nv_fp8_storage_t val; }; u64 = opArg; scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3)); @@ -501,7 +634,7 @@ struct FuncPreMulSum { struct FuncPreMulSum<__nv_fp8_e5m2> { using EltType = __nv_fp8_e5m2; __half2 scalar2; - __device__ FuncPreMulSum(uint64_t opArg) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg) { union { uint64_t u64; __nv_fp8_storage_t val; }; u64 = opArg; scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2)); @@ -513,7 +646,7 @@ struct FuncPreMulSum { template struct Apply_Reduce, EltPerPack> { - __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } @@ -523,7 +656,7 @@ struct Apply_Reduce, EltPerPack> { template struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(fromPack(a) * fn.scalar); } }; @@ -534,7 +667,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return toPack(__hmul(fromPack(a), fn.scalar.x)); #else @@ -546,7 +679,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(__hmul2(fromPack(a), fn.scalar)); } }; @@ -559,7 +692,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { #if __CUDA_ARCH__ >= 800 @@ -573,7 +706,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar)); @@ 
-590,7 +723,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a ) { return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x))); @@ -599,7 +732,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a ) { return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2))); @@ -609,7 +742,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a ) { return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x))); @@ -618,7 +751,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a ) { return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2))); @@ -633,7 +766,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { return *(uint64_t*)ptr; } }; @@ -646,12 +779,12 @@ struct FuncSumPostDiv { uint32_t divisor:31, isSigned:1; UintType recip; - __device__ FuncSumPostDiv(uint64_t opArg=0) { + __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) { isSigned = opArg & 1; divisor = opArg >> 1; recip = UintType(-1)/divisor; } - __device__ T divide(T x) { + __device__ __forceinline__ T divide(T x) { // x is negative iff we are in signed mode and the top bit is set bool xneg = isSigned && (x & ~(T(-1)>>1)); // Compute abs(x): @@ -673,7 +806,7 @@ struct FuncSumPostDiv { template struct Apply_Reduce, EltPerPack>: Apply_Reduce, EltPerPack> { - __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. 
return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } @@ -682,7 +815,7 @@ struct Apply_Reduce, EltPerPack>: template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { return toPack(fn.divide(fromPack(a))); } }; @@ -690,120 +823,145 @@ struct Apply_PostOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // Apply_LoadMultimem -#define SIZEOF_BytePack_field_u16 2 -#define PTX_REG_BytePack_field_u16 "h" - -#define SIZEOF_BytePack_field_u32 4 -#define PTX_REG_BytePack_field_u32 "r" - -#define SIZEOF_BytePack_field_u64 8 -#define PTX_REG_BytePack_field_u64 "l" +#define RegCode_for_size_1 "r" +#define RegCode_for_size_2 "h" +#define RegCode_for_size_4 "r" +#define RegCode_for_size_8 "l" + +#define RegSize_for_size_1 4 +#define RegSize_for_size_2 2 +#define RegSize_for_size_4 4 +#define RegSize_for_size_8 8 + +#define PtxAcc_for_u32 +#define PtxAcc_for_s32 +#define PtxAcc_for_s64 +#define PtxAcc_for_u64 +#define PtxAcc_for_f32 +#define PtxAcc_for_f64 +#if CUDART_VERSION >= 12020 + #define PtxAcc_for_f16 ".acc::f32" + #define PtxAcc_for_bf16 ".acc::f32" + #define PtxAcc_for_f16x2 ".acc::f32" + #define PtxAcc_for_bf16x2 ".acc::f32" +#else + #define PtxAcc_for_f16 + #define PtxAcc_for_bf16 + #define PtxAcc_for_f16x2 + #define PtxAcc_for_bf16x2 +#endif +#define PtxAcc_for_e4m3 ".acc::f16" +#define PtxAcc_for_e5m2 ".acc::f16" +#define PtxAcc_for_e4m3x4 ".acc::f16" +#define PtxAcc_for_e5m2x4 ".acc::f16" -#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, PackSize) \ template<> \ - struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ - static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack ans; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + struct Apply_LoadMultimem, PackSize> { \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + BytePack reg; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ + BytePack ans; \ + ans.native = reg.native; \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, PackSize) \ template<> \ - struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ - static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack ans; \ + struct Apply_LoadMultimem, PackSize> { \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + BytePack reg; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max." 
#ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ } \ + BytePack ans; \ + ans.native = reg.native; \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \ template<> \ - struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ - static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack ans; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + struct Apply_LoadMultimem, 4*(VecEltSize)> { \ + static constexpr int PackSize = 4*(VecEltSize); \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + union { BytePack ans; BytePack elts[4]; }; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \ template<> \ - struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ - static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack ans; \ + struct Apply_LoadMultimem, 4*(VecEltSize)> { \ + static constexpr int PackSize = 4*(VecEltSize); \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + union { BytePack ans; BytePack elts[4]; }; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ } \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \ - DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(T, ptx_ty, VecEltSize) \ + DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack<2*sizeof(T)> tmp; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ - return tmp.half[(addr/sizeof(T))%2]; \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + union { BytePack tmp; BytePack elts[(VecEltSize)/sizeof(T)]; }; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ + return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \ - DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(T, ptx_ty, VecEltSize) \ + DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack<2*sizeof(T)> tmp; \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + union { BytePack tmp; BytePack elts[(VecEltSize)/sizeof(T)]; }; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max." 
#ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ } \ - return tmp.half[(addr/sizeof(T))%2]; \ + return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \ } \ }; template struct Apply_LoadMultimem { - __device__ static BytePack load(Fn fn, uintptr_t addr) { + __device__ __forceinline__ static BytePack load(Fn fn, uintptr_t addr) { __trap(); return {}; } @@ -826,29 +984,36 @@ struct Apply_LoadMultimem { /*multimem.ld_reduce not supported:*/ 0; }; - DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32) - DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32) + DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, 4) + DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, 4) - DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32) - DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32) + DEFINE_Apply_LoadMultimem_sum(int32_t, s32, 4) + DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, 4) - DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64) - DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64) + DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, 8) + DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, 8) - DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64) - DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64) + DEFINE_Apply_LoadMultimem_sum(int64_t, u64, 8) + DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, 8) - DEFINE_Apply_LoadMultimem_sum(float, f32, u32) - DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32) + DEFINE_Apply_LoadMultimem_sum(float, f32, 4) + DEFINE_Apply_LoadMultimem_sum_v4(float, f32, 4) - DEFINE_Apply_LoadMultimem_sum(double, f64, u64) + DEFINE_Apply_LoadMultimem_sum(double, f64, 8) - DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32) - DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(half, f16x2, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(half, f16x2, 4) #if defined(__CUDA_BF16_TYPES_EXIST__) - DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) - DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_bfloat16, bf16x2, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) + #endif + + #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) #endif #else template @@ -860,11 +1025,29 @@ struct Apply_LoadMultimem { #undef DEFINE_Apply_LoadMultimem #undef DEFINE_Apply_LoadMultimem_v4 #undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf -#undef SIZEOF_BytePack_field_u64 -#undef PTX_REG_BytePack_field_u64 -#undef SIZEOF_BytePack_field_u32 -#undef PTX_REG_BytePack_field_u32 -#undef SIZEOF_BytePack_field_u16 -#undef PTX_REG_BytePack_field_u16 + +#undef RegCode_for_size_2 +#undef RegCode_for_size_4 +#undef RegCode_for_size_8 + +#undef RegSize_for_size_1 +#undef RegSize_for_size_2 +#undef RegSize_for_size_4 +#undef RegSize_for_size_8 + +#undef PtxAcc_for_u32 +#undef PtxAcc_for_s32 +#undef PtxAcc_for_s64 +#undef PtxAcc_for_u64 +#undef PtxAcc_for_f32 
+#undef PtxAcc_for_f64 +#undef PtxAcc_for_f16 +#undef PtxAcc_for_bf16 +#undef PtxAcc_for_f16x2 +#undef PtxAcc_for_bf16x2 +#undef PtxAcc_for_e4m3 +#undef PtxAcc_for_e5m2 +#undef PtxAcc_for_e4m3x4 +#undef PtxAcc_for_e5m2x4 #endif // REDUCE_KERNEL_H_ diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 5d8de2819..63b981b09 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -142,82 +142,206 @@ struct RunWorkColl struct RunWorkColl { + template + struct Scatterer { + struct ncclDevWorkColl* work; + int chunkCount; + ssize_t railGridOffset; + + template + __device__ __forceinline__ void operator()( + int tid, int tn, int slice, int maxSliceSize, + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag + ) { + static_assert(SlicePerChunk == 1, "require: SlicePerChunk==1"); + static_assert(MaxDsts <= 1 || MaxSrcs <= 1, "require: MaxDsts<=1 || MaxSrcs<=1"); + + struct ncclNvls* nvls = &ncclShmem.channel.nvls; + int nNodes = ncclShmem.comm.nNodes; + int nRails = nvls->nHeads; + int part = ncclShmem.channelId - work->channelLo; + void* inbuf = (void*)work->sendbuff; + ssize_t countPerRank = work->collnet.count; + + ssize_t railAllBeg = min(railGridOffset + part * chunkCount, nNodes * countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + int railAllSize = railAllEnd - railAllBeg; + int rail = nvls->headRank; + int dst = 0; + if (ReduceSendNotRecv) { + if (work->regUsed) return; + rail = 0; + nSrcs = 1; + } else { + rail = nvls->headRank; + } + if (tid < nDsts) dstSizes[tid] = railAllSize; + do { + int node = railAllBeg / countPerRank; + int railAllOffset = 0; + while (railAllOffset < railAllSize) { + ssize_t railOneBeg = node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg; + int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset); + int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail]; + ssize_t userOneBeg = rank * countPerRank + railOneOffset; + if (nDsts != 0) { + reduceCopy + (tid, tn, work->redOpArg, &work->redOpArg, false, + /*nSrcs=*/nSrcs, [=]__device__(int s) { + return work->regUsed ? (T*)srcPtrs[s] + userOneBeg : + !ReduceSendNotRecv ? (T*)srcPtrs[s] + railAllOffset: + (T*)inbuf + userOneBeg; + }, + /*nDsts=*/1, [=]__device__(int d/*==0*/) { + return (T*)dstPtrs[dst] + railAllOffset; + }, delta); + } + railAllOffset += delta; + node += 1; + } + dst += 1; + rail += 1; + } while (ReduceSendNotRecv && dst < nRails); + } + }; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - size_t count; - size_t gridOffset; - size_t channelCount; - size_t chunkCount; - ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); - const int rank = ncclShmem.comm.rank; - const int nranks = ncclShmem.comm.nRanks; - size_t offset; int nelem; /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth * and the rest are allocated to scatter. */ - const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); - const int nThreadsScatter = work->regUsed ? 
WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); - const int tidEndScatter = nThreadsScatter; + const int nThreadsNetRecv = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE); + const int nThreadsScatter = work->regUsed ? roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE; + const int nThreadsReduce = NCCL_MAX_NTHREADS - nThreadsNetRecv - nThreadsScatter; + const int tidEndNetRecv = nThreadsNetRecv; + const int tidEndScatter = tidEndNetRecv + nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; - if (!work->regUsed) { - if (tid < tidEndScatter) { - // Scatter - using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0); + if (work->oneNode) { + const int rank = ncclShmem.comm.rank; + size_t offset; + size_t count, gridOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); + if (!work->regUsed) { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.recv(offset, nelem); + } } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - } else if (tid < tidEndReduce) { - // Reduce through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.recv(offset, nelem); + } else { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + prims.scatter(0, 0, 0, 0, -1, 0); + } + + /* gather used as sync */ + prims.gather(0, 0, 0, 0, -1, 0); + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, 
/*Direct=*/1, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + size_t outOffset = gridOffset + elemOffset; + size_t inpOffset = outOffset + rank * count; + nelem = min(chunkCount, channelCount - elemOffset); + // Coverity complains about a possible overrun inside the method invoked below, but that's actually + // a false positive. + // coverity[overrun-call:FALSE] + prims.directRecvCopy(inpOffset, outOffset, nelem); + } + + /* send for sync */ + prims.send(0, 0); } } } else { - if (tid < tidEndScatter) { - // Scatter + // multi-node + int nNodes = ncclShmem.comm.nNodes; + int part = ncclShmem.channelId - work->channelLo; + ssize_t countPerRank = work->collnet.count; + const int nChannels = work->channelHi - work->channelLo + 1; + ssize_t chunkCount = work->collnet.chunkCount; + if (tid < tidEndNetRecv) { using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - prims.scatter(0, 0, 0, 0, -1, 0); + if (work->netRegUsed) { + if (tid == 0) { + int steps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(nvls->out, 0, steps); + } + __syncwarp(); + } else { + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsNetRecv, &nvls->out, nullptr, nullptr, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + ssize_t railAllBeg = railGridOffset + part * chunkCount; + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t beg = max(railAllBeg, railOneBeg); + ssize_t end = min(railAllEnd, railOneEnd); + prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true); + } } - - /* gather used as sync */ - prims.gather(0, 0, 0, 0, -1, 0); - } else if (tid < tidEndReduce) { - // Reduce through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - size_t outOffset = gridOffset + elemOffset; - size_t inpOffset = outOffset + rank * count; - nelem = min(chunkCount, channelCount - elemOffset); - // Coverity complains about a possible overrun inside the method invoked below, but that's actually - // a false positive. 
- // coverity[overrun-call:FALSE] - prims.directRecvCopy(inpOffset, outOffset, nelem); + } else { + if (tid < tidEndScatter) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndNetRecv, nThreadsScatter, nullptr, nvls->up, work->sendbuff, nullptr, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkCount = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } + } else if (tid < tidEndReduce) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->out, nullptr, nullptr, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkCount = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } } - - /* send for sync */ - prims.send(0, 0); } } } @@ -231,7 +355,7 @@ struct RunWorkColl + template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh new file mode 100644 index 000000000..8f81347ec --- /dev/null +++ b/src/device/symmetric/all_gather.cuh @@ -0,0 +1,367 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ void bcastDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + char* inputHere, char* outputRank0, bool inPlace, int nIters + ) { + using Pack = BytePack; + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack tmp[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = inpHere[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + int dr = inPlace ? 1 : 0; + int r = rank + dr; + if (r == nRanks) r = 0; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && dr == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u]; + } + if (++r == nRanks) r = 0; + } + } + } + inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. 
+ #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = inpHere[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ void bcastEnds( + ncclSymPrims& prim, int tn, int t, + T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpHere = (BytePack*)inputHere; + BytePack* outRank0 = (BytePack*)outputRank0; + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i; + BytePack tmp = inpHere[elt]; + int dr = inPlace ? 1 : 0; + int r = rank + dr; + if (r == nRanks) r = 0; + #pragma unroll 1 + for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) { + #pragma unroll UnrollPeers + for (int u=0; u < UnrollPeers; u++) { + *add4G(outRank0+elt, r*stride4G) = tmp; + if (++r == nRanks) r = 0; + } + } + #pragma unroll UnrollPeers + for (int u=0; u < UnrollPeers; u++) { + if (dr+u == nRanks) break; + *add4G(outRank0+elt, r*stride4G) = tmp; + if (++r == nRanks) r = 0; + } + } +} + +template +static __device__ void bcast( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts + ) { + bool inPlace = (input == output); + // Mpve to rank=0 + output = prim.peerPtr(0, output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (128u - inputUptr)%128u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + bcastDeep( + prim, tn, t, waitNeeded, + (char*)input + cursor, (char*)output + cursor, inPlace, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>( + prim, tn, t, waitNeeded, + (char*)input + cursor, (char*)output + cursor, inPlace, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + bcastEnds(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); +} + +__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + int const& rank = prim.rank; + + // Threads numbered over rank. 
+ int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int btn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void bcastMultimem( + ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts + ) { + // Move output to multimem + output = prim.multimemPtr(output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16-inputUptr)%16; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 8; + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + uint32_t nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = *reinterpret_cast*>(inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]); + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = *reinterpret_cast*>(inputUptr + cursor); + multimem_st_global(outputUptr + cursor, val); + cursor += tn*sizeof(T); + } +} + +__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + int const& rank = prim.rank; + + char* input = args->input; + char* output = args->output; + size_t bytes = args->nElts; + // Round robin memory to blocks. 
+ int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +template +static __device__ void allgather_LL_body( + ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts + ) { + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(EltType); + + ncclCoopCta cta; + int rank = prim.rank; + int nRanks = prim.nRanks; + constexpr int tn = ncclSymMaxThreads; + int t = threadIdx.x; + + #pragma unroll 1 + while (0 < nElts) { + int nIterPacks = min(nPacks, tn); + if (t < nIterPacks) { + Pack x = loadPack(input, t*EltPerPack, nElts); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, x); + } + + int tn_div_nPacks = tn/nIterPacks; + int tn_mod_nPacks = tn%nIterPacks; + int peer = t/nIterPacks; + int pack = t%nIterPacks; + #if 1 + // NOTE: Unrolling speedup on eos nranks=8 size=64K: 5.7us vs 6.7us + constexpr int Unroll = 4; + #pragma unroll 1 + for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) { + Pack got[Unroll]; + prim.template recvLL(i, Unroll, tn, /*&*/got); + #pragma unroll + for (int u=0; u < Unroll; u++) { + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + } + + int i = (nRanks*nIterPacks & -(Unroll*tn)) + t; + int n = (nRanks*nIterPacks)/tn % Unroll; + if (i + n*tn < nRanks*nIterPacks) n += 1; + if (n != 0) { + Pack got[Unroll]; + prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got); + #pragma unroll + for (int u=0; u < Unroll; u++) { + if (u != 0 && u == n) break; + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + } + #else + // The non-unrolled but "obviously correct" implementation for reference. + #pragma unroll 1 + for (int i = t; i < nRanks*nIterPacks; i += tn) { + Pack got = prim.template recvLL(i); + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + #endif + + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } +} + +static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); + using Pack = BytePack<8>; + constexpr int BytePerPack = 8; + int nElts = args->nElts; + int nPacks = divUp(nElts, BytePerPack); + + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); + int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); + int nBlockPacks = blockPackEnd - blockPackBegin; + int nBlockElts = nElts - blockPackBegin*BytePerPack; + nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack); + char* blockInput = args->input + blockPackBegin*BytePerPack; + char* blockOutput = args->output + blockPackBegin*BytePerPack; + + uint32_t lowBits = args->nElts; + lowBits |= (uint32_t)reinterpret_cast(args->input); + lowBits |= (uint32_t)reinterpret_cast(args->output); + if (__builtin_expect(lowBits%8 == 0, true)) { + // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us + allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8); + } else { + allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts); + } +} + +__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) { + ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false); +} + +__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) { + ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true); +} diff --git a/src/device/symmetric/all_reduce.cuh b/src/device/symmetric/all_reduce.cuh new file mode 100644 index 000000000..6c5219784 --- /dev/null +++ b/src/device/symmetric/all_reduce.cuh @@ -0,0 +1,432 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ __forceinline__ void allreduceDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, char* inputRank0, char* outputRank0, int32_t nIters + ) { + using Pack = BytePack; + using Acc = typename Red::EltType; + using AccPack = BytePack; + + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack acc0[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + AccPack acc1[UnrollPacks]; + int r = rank; + if (++r == nRanks) r = 0; + { Pack tmp1[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, applyCast(acc0[u]), applyCast(tmp1[u])); + } + } + + if (++r == nRanks) r = 0; + + int dr = 2; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? 
i++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + Pack tmp1[UnrollPeers][UnrollPacks]; + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + if (++r == nRanks) r = 0; + } + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, acc1[u], applyCast(tmp1[ur][u])); + } + } + } + } + + #pragma unroll + for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); + + dr = 0; + r = rank; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && dr == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u]; + } + if (++r == nRanks) r = 0; + } + } + } + + inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ __forceinline__ void allreduceEnds( + ncclSymPrims& prim, int tn, int t, Red red, + T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + using Acc = typename Red::EltType; + + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpRank0 = (BytePack*)inputRank0; + BytePack* outRank0 = (BytePack*)outputRank0; + + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; + BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc1; + BytePack tmp[UnrollPeers]; + int dr = 1; + int r = rank+1; + if (nRanks == r) r = 0; + bool first = true; + + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j = 0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? j++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + tmp[u] = *add4G(inpRank0+elt, r*stride4G); + r += 1; + if (r == nRanks) r = 0; + } + if (first) { + first = false; + acc1 = applyCast(acc0); + } + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + acc1 = applyReduce(red, acc1, applyCast(tmp[u])); + } + } + } + + acc0 = applyCast(acc1); + dr = 0; + r = rank; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j=0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? 
j++ : (dr += UnrollPeers)) { + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && dr+u == nRanks) break; + *add4G(outRank0+elt, r*stride4G) = acc0; + r += 1; + if (r == nRanks) r = 0; + } + } + } + } +} + +template +static __device__ void allreduce( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, T* input, T* output, size_t nElts + ) { + int nRanks = prim.nRanks; + int nBlocks = prim.nBlocks; + // Mpve to rank=0 + input = prim.peerPtr(0, input); + output = prim.peerPtr(0, output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16u - inputUptr)%16u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + allreduceDeep( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + allreduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); +} + + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + int /*const&*/ rank = prim.rank; + int /*const&*/ nRanks = prim.nRanks; + Red::Type> red(args->redOpArg); + + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void allreduceMultimem( + ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + ) { + // Mpve to multimem + input = prim.multimemPtr(input); + output = prim.multimemPtr(output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + constexpr int BytePerPack = LoadMultimem_BigPackSize::BigPackSize; + uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if (alignof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) { + constexpr int UnrollPacks = 16*8/BytePerPack; + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + int nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = applyLoadMultimem(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]); + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = applyLoadMultimem(red, inputUptr + cursor); + multimem_st_global(outputUptr + cursor, val); + cursor += tn*sizeof(T); + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + Red::Type> red(args->redOpArg); + + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.rank, prim.nRanks, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = prim.nRanks*prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); + int /*const&*/ rank = prim.rank; + using Acc = typename ncclSymAccumType::Type; + Red red(args->redOpArg); + + using Pack = BytePack<8>; + using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; + constexpr int EltPerPack = 8/sizeof(T); + int nElts = args->nElts; + int nPacks = divUp(nElts, EltPerPack); + + bool packAligned = 8 <= alignof(T) || ( + args->nElts*sizeof(T) | + (uint32_t)reinterpret_cast(args->input) | + (uint32_t)reinterpret_cast(args->output) + )%8 == 0; + + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); + int begin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); + + nPacks = end - begin; + nElts -= begin*EltPerPack; + nElts = min(nElts, nPacks*EltPerPack); + T* input = (T*)args->input + begin*EltPerPack; + T* output = (T*)args->output + begin*EltPerPack; + + ncclCoopCta cta; + int t = threadIdx.x; + int tn = ncclSymMaxThreads; + + if (__builtin_expect(packAligned, true)) { + #pragma unroll 1 + while (0 < nPacks) { + if (t < nPacks) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack((Pack*)input, t, nPacks); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); + Pack out = prim.template recvReduceLL(t, nIterPacks, red); + storePack((Pack*)output, t, nPacks, out); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nPacks -= tn; + } + } else { + #pragma unroll 1 + while (0 < nElts) { + if (t*EltPerPack < nElts) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack(input, t*EltPerPack, nElts); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); + Pack out = prim.template recvReduceLL(t, nIterPacks, red); + storePack(output, t*EltPerPack, nElts, out); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) { + ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); +} +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) { + ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); +} diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py new file mode 100755 index 000000000..f630ff072 --- /dev/null +++ b/src/device/symmetric/generate.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +import os +import sys + +################################################################################ +# The first command line argument is the path to the directory to generate and +# populate. 
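+# For example, the build is expected to run it roughly as
+#   ./generate.py $(OBJDIR)/gensrc/symmetric
+# (illustrative; the exact invocation lives in the Makefile, not in this script).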
+ +gensrc = sys.argv[1] + +if os.path.exists(gensrc): + for name in os.listdir(gensrc): + os.remove(os.path.join(gensrc, name)) + #os.truncate(os.path.join(gensrc, name), 0) +else: + os.mkdir(gensrc) + +def paste(sep, *args): + return sep.join(args) + +indents = 0 +def emitln(f, lines): + global indents + for ln in ((lines,) if isinstance(lines, str) else lines): + f.write(' '*indents + ln + '\n') + +def indent(s): + return '\n'.join(' '+l for l in s.splitlines()) + +class Rec(object): + def __init__(me, **kw): + me.__dict__.update(kw) + def __eq__(x, y): + if len(x) != len(y): return False + for k in x: + if k not in y: return False + if x[k] != y[k]: return False + return True + def __hash__(me): + h = 0 + for k in me.__dict__: + h += hash((k, me.__dict__[k])) + return h + +################################################################################ +# Edit this region for introducing new algos etc + +reductions = ["AllReduce","ReduceScatter"] +all_reds = ["sum"] +all_tys = ["f32","f16","bf16","f8e4m3","f8e5m2"] + +nvls_algos_by_coll = { + "AllReduce": ["AGxLLMC_R","RSxLDMC_AGxSTMC"], + "ReduceScatter": ["LDMC"] +} +ldmc_algos = ["RSxLDMC_AGxSTMC", "LDMC"] + +coll_to_lower = { + "AllGather": "all_gather", + "AllReduce": "all_reduce", + "ReduceScatter": "reduce_scatter" +} + +red_to_ncclDevRedOp = { + "sum": "ncclDevSum" +} +red_to_Func = { + "sum": "FuncSum" +} + +ty_to_ncclDataType = { + "f32": "ncclFloat32", + "f16": "ncclFloat16", + "bf16": "ncclBfloat16", + "f8e4m3": "ncclFloat8e4m3", + "f8e5m2": "ncclFloat8e5m2" +} +ty_to_cxxtype = { + "f32": "float", + "f16": "half", + "bf16": "__nv_bfloat16", + "f8e4m3": "__nv_fp8_e4m3", + "f8e5m2": "__nv_fp8_e5m2" +} + +def enumerate_kernels(): + for algo in ["LL","LLMC","ST","STMC"]: + yield Rec(coll="AllGather", algo=algo) + for red in all_reds: + for ty in all_tys: + for algo in ["AGxLL_R","AGxLLMC_R","RSxLD_AGxST","RSxLDMC_AGxSTMC"]: + yield Rec(coll="AllReduce", algo=algo, red=red, ty=ty) + for algo in ["LL","LD","LDMC"]: + yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty) + +def required_cuda(k): + cudart, arch, specific_sms = 0, 0, None + is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, []) + if is_nvls: + cudart = max(cudart, 12010) + arch = 900 + if k.coll in reductions: + if k.ty == "bf16": + cudart = max(cudart, 11000) + if k.ty.startswith("f8"): + cudart = max(cudart, 11080) + arch = 900 + if k.algo in ldmc_algos: + cudart = 12070 + arch = None + specific_sms = [100, 120] + return (cudart, arch, specific_sms) + +################################################################################ + +def kernel_fdep(k): + return coll_to_lower[k.coll] + '.cu' + +def kernel_fname(k): + if k.coll in reductions: + if k.algo in ldmc_algos and k.ty.startswith('f8'): + return paste('_', coll_to_lower[k.coll], k.red, k.ty, k.algo) + '.cu' + else: + return paste('_', coll_to_lower[k.coll], k.red, k.ty) + '.cu' + else: + return coll_to_lower[k.coll] + '.cu' + +def kernel_gencode(k): + if k.coll in reductions and k.algo in ldmc_algos and k.ty.startswith('f8'): + return "$(NVCC_GENCODE_LDMC_FP8)" + else: + return "$(NVCC_GENCODE)" + +def kernel_cname(k): + if k.coll in reductions: + return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty) + else: + return paste("_", "ncclSymDevKernel", k.coll, k.algo) + +def kernel_conds(k): + cudart, arch, specific_sms = required_cuda(k) + if cudart == 0: return (None, None) + + cudart_cond = "CUDART_VERSION >= %d"%cudart + if not specific_sms: + arch_cond = "__CUDA_ARCH__ >= %d"%arch + else: 
+ arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + return cudart_cond, arch_cond + +def instantiate(k): + cudart_cond, arch_cond = kernel_conds(k) + if (cudart_cond, arch_cond) == (None, None): + form_red_ty = ( + "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + "}}" + ) + form = ( + "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " ncclSymRun_{id}(&args);\n" + "}}" + ) + else: + form_red_ty = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " #if {arch_cond}\n" + " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + " #endif\n" + " }}\n" + "#endif" + ) + form = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " #if {arch_cond}\n" + " ncclSymRun_{id}(&args);\n" + " #endif\n" + " }}\n" + "#endif" + ) + + id = k.coll+'_'+k.algo + cname = kernel_cname(k) + if k.coll in reductions: + inst = form_red_ty.format(cname=cname, id=id, red=red_to_Func[k.red], ty=ty_to_cxxtype[k.ty], cudart_cond=cudart_cond, arch_cond=arch_cond) + else: + inst = form.format(cname=cname, id=id, cudart_cond=cudart_cond, arch_cond=arch_cond) + return inst + +def prototype(k): + cudart_cond, arch_cond = kernel_conds(k) + if cudart_cond is None: + form = "__global__ void {cname}(ncclSymDevArgs const);" + else: + form = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs const);\n" + "#else\n" + " constexpr void* {cname} = nullptr;\n" + "#endif" + ) + return form.format(cname=kernel_cname(k), cudart_cond=cudart_cond) + +################################################################################ + +def partition(vals, keyfn): + ans = {} + for x in vals: + k = keyfn(x) + if k not in ans: + ans[k] = [] + ans[k].append(x) + return ans + + +kernels_by_file = partition(enumerate_kernels(), lambda k: (kernel_fname(k), k.coll)) + +# Add dependency only files (e.g. 
allreduce.cu) +for coll in set(k.coll for k in enumerate_kernels()): + fname = coll_to_lower[coll]+'.cu' + if (fname, coll) not in kernels_by_file: + kernels_by_file[fname, coll] = [] + +# Generate each kernel instantiation file +for (fname, coll), ks in kernels_by_file.items(): + with open(os.path.join(gensrc, fname), "w") as f: + emitln(f, '#include "symmetric.h"') + emitln(f, '#include "symmetric/kernel.cuh"') + emitln(f, '#include "symmetric/{coll}.cuh"'.format(coll=coll_to_lower[coll])) + for k in ks: + emitln(f, instantiate(k)) + +# Generate /symmetric_host.cc +with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f: + emitln(f, '#include "symmetric.h"') + emitln(f, '#include "device.h"') + emitln(f, '') + + for k in enumerate_kernels(): + emitln(f, prototype(k)) + emitln(f, '') + + emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels()))) + emitln(f, 'extern void* const ncclSymKernelList[] = {') + for k in enumerate_kernels(): + emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) + emitln(f, 'nullptr};') + emitln(f, '') + + emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {') + indents += 1 + emitln(f, 'switch (id) {') + emitln(f, 'default: return nullptr;') + for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items(): + emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':') + indents += 1 + if len(coll_algo_ks) == 1: + emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';') + else: + emitln(f, 'switch ((ncclDevRedOp_t)red) {') + emitln(f, 'default: return nullptr;') + for red, coll_algo_red_ks in partition(coll_algo_ks, lambda k: k.red).items(): + emitln(f, 'case '+red_to_ncclDevRedOp[red]+':') + indents += 1 + emitln(f, 'switch (ty) {') + emitln(f, 'default: return nullptr;') + for k in coll_algo_red_ks: + emitln(f, 'case '+ty_to_ncclDataType[k.ty]+': return (void*)'+kernel_cname(k)+';') + emitln(f, '}') + indents -= 1 + emitln(f, '}') + indents -=1 + emitln(f, '}') + indents -= 1 + emitln(f, '}') + +# Generate /rules.mk +with open(os.path.join(gensrc, "rules.mk"), "w") as f: + inst_names = sorted(set(kernel_fname(k) for k in enumerate_kernels())) + names = inst_names + ["symmetric_kernels.cc"] + f.write("LIB_OBJS_SYM_GEN = $(patsubst %,$(OBJDIR)/genobj/symmetric/%.o,{names})\n" + .format(names=" ".join(names))) + f.write("\n") + + inst_names = sorted(set((k.coll, kernel_fname(k), kernel_gencode(k)) for k in enumerate_kernels())) + for coll, name, gencode in inst_names: + f.write( + "$(OBJDIR)/genobj/symmetric/{name}.o: $(OBJDIR)/gensrc/symmetric $(OBJDIR)/genobj/symmetric/{coll}.cu.d\n" + "\t" "$(call COMPILE_SYM,$@,$(OBJDIR)/gensrc/symmetric/{name},{gencode})\n" + "\n" + .format(name=name, coll=coll_to_lower[coll], gencode=gencode) + ) diff --git a/src/device/symmetric/kernel.cuh b/src/device/symmetric/kernel.cuh new file mode 100644 index 000000000..f631d51d9 --- /dev/null +++ b/src/device/symmetric/kernel.cuh @@ -0,0 +1,27 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_ +#define NCCL_DEVICE_SYMMETRIC_KERNEL_H_ + +#include "symmetric.h" + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args); + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args); +template 
typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args); + +__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args); + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args); +#endif diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh new file mode 100644 index 000000000..167024400 --- /dev/null +++ b/src/device/symmetric/primitives.cuh @@ -0,0 +1,420 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ +#define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ + +#include "symmetric.h" +#include "bitops.h" +#include "collectives.h" +#include "op128.h" +#include "reduce_kernel.h" + +#if __CUDA_ARCH__ >= 700 +// __grid_constant__ appears to break cuda-gdb +#define NCCL_GRID_CONSTANT __grid_constant__ +#else +#define NCCL_GRID_CONSTANT +#endif + +// flattenIx(pos0, dim0, pos1, dim1, pos2, dim2, ...) +// Given a position vector `pos` in a rectangular index space with lengths in the `dim` +// vector, flatten that down to a linear index. The fastest moving dimension is given first. +__device__ __forceinline__ int flattenIx() { return 0; } + +template +static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) { + return pos + size*flattenIx(more...); +} + +// Precomputed integer reciprocoals for denominator values 1..64 inclusive. +// Pass these to idivFast64() for fast division on the GPU. 
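+// (idivRcp32_upto64() below keeps only the high 32 bits, presumably for the 32-bit
+// helpers idivmodFast32()/imodFast32() that the kernels in this directory rely on.)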
+static __device__ uint64_t idivRcp64_upto64(int x) { + static constexpr uint64_t table[65] = { + idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), + idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), + idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), + idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), + idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), + idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), + idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), + idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), + idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), + idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), + idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), + idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), + idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), + idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), + idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), + idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), + idivRcp64(0x40) + }; + return table[x]; +} + +static __device__ uint32_t idivRcp32_upto64(int x) { + return idivRcp64_upto64(x)>>32; +} + +namespace { +struct ncclCoopCta { + __device__ void sync() { __syncthreads(); } + __device__ int self() { return threadIdx.x; } + __device__ int count() { return blockDim.x; } +}; +struct ncclCoopWarps { + int log2_nWarps; + __device__ void sync() { + asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<= 12030 && __CUDA_ARCH__ >= 900 + cudaGridDependencySynchronize(); + #endif + + if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) { + barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block]; + } + if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2; + } + __device__ ~ncclSymPrims() { + if (threadIdx.x == 0) { + if (flags & ncclSymPrims_UseBarrier) { + ((flags & ncclSymPrims_UseMultimem) ? 
base->barEpochMc : base->barEpochUc)[block] = barEpoch; + } + if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2; + } + } + + template + __device__ T* peerPtr(int peer, T* selfPtr) { + return add4G(selfPtr, (peer-rank)*stride4G); + } + + template + __device__ T* multimemPtr(T* selfPtr) { + return reinterpret_cast(reinterpret_cast(selfPtr) + offsetMc); + } + + __device__ void barrierArrive(ncclCoopCta cta, bool release) { + cta.sync(); + #if __CUDA_ARCH__ < 700 + if (release) { + if (cta.self() == 0) __threadfence_system(); + cta.sync(); + } + #endif + if (flags & ncclSymPrims_UseMultimem) { + #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 + if (cta.self() == 0) { + uint32_t* inbox = &multimemPtr(base)->barInboxMc[block]; + if (release) { + asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); + } else { + asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); + } + } + #endif + } else { + int r = cta.self(); + if (r != rank && r < nRanks) { + uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank]; + #if __CUDA_ARCH__ >= 700 + if (release) { + asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + } else { + asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + } + #else + asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + #endif + } + } + } + + __device__ void barrierWait(ncclCoopCta cta, bool acquire) { + if (flags & ncclSymPrims_UseMultimem) { + #if __CUDA_ARCH__ >= 900 + if (cta.self() == 0) { + uint32_t* inbox = &base->barInboxMc[block]; + while (true) { + uint32_t got; + if (acquire) { + asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } else { + asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } + if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break; + } + barEpoch += nRanks; + } + #endif + } else { + int r = cta.self(); + if (r != rank && r < nRanks) { + uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r]; + while (true) { + uint32_t got; + #if __CUDA_ARCH__ >= 700 + if (acquire) { + asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } else { + asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } + #else + asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + #endif + if (got-(barEpoch+1) <= uint32_t(-1)>>1) break; + } + } + #if __CUDA_ARCH__ < 700 + if (acquire) { + cta.sync(); + if (cta.self() == 0) __threadfence(); + } + #endif + barEpoch += 1; + } + cta.sync(); + } + + __device__ void endLL(ncclCoopCta cta) { + if (__builtin_expect(llEpoch >= -2u, false)) { + cta.sync(); + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch); + int epochSize = ncclSymLLEpochSize(nRanks); + #pragma unroll 4 + for (int i=cta.self(); i*16 < epochSize; i += cta.count()) { + buf[i] = uint4{0, 0, 0, 0}; + } + } + cta.sync(); + llEpoch += (llEpoch == -1u) ? 
3 : 1; + } + + template + __device__ void sendLL(int peer, int slot, T val) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + } + + template + __device__ void bcastLL(int slot, T val) { + if (flags & ncclSymPrims_UseMultimem) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + } else { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot; + int dr = 0; + int r = rank; + #pragma unroll 1 + for (; dr+8 <= nRanks; dr += 8) { + #pragma unroll + for (int ur=0; ur < 8; ur++) { + uint4* buf = add4G(buf0, r*stride4G); + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + r += 1; + if (r == nRanks) r = 0; + } + } + #pragma unroll + for (int ur=0; ur < 8; ur++, dr++) { + if (dr == nRanks) break; + uint4* buf = add4G(buf0, r*stride4G); + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + r += 1; + if (r == nRanks) r = 0; + } + } + } + + template + __device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) { + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0; + uint4 tmp[nSlotsMax][divUp(sizeof(T),8)]; + //int spins=0; + while (true) { + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + if (u < nSlotsMin || u < nSlots) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T)))); + } + } + } + bool okAll = true; + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + if (u < nSlotsMin || u < nSlots) { + bool ok = tmp[u][v].y == llEpoch && + tmp[u][v].w == llEpoch; + okAll &= ok; + } + } + } + if (__builtin_expect(okAll, true)) break; + //if (spins++ == 10<<20) spins=0; + } + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + if (nSlotsMin <= u && u == nSlots) break; + union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; }; + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + u32[v][0] = tmp[u][v].x; + u32[v][1] = tmp[u][v].z; + } + elts[u] = val; + } + } + + template + __device__ Pack recvReduceLL(int slot, int stride, Red red) { + using Acc = typename Red::EltType; + using AccPack = BytePack; + AccPack acc; + bool first = true; + int r = 0; + #pragma unroll 1 + for (; r+Unroll <= nRanks; r += Unroll) { + Pack got[Unroll]; + this->template recvLL(slot + r*stride, Unroll, stride, got); + AccPack acc0 
= applyCast(got[0]); + acc = first ? acc0 : applyReduce(red, acc, acc0); + first = false; + #pragma unroll + for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast(got[i])); + } + if (r < nRanks) { + Pack got[Unroll]; + this->template recvLL(slot + r*stride, nRanks-r, stride, got); + AccPack acc0 = applyCast(got[0]); + acc = first ? acc0 : applyReduce(red, acc, acc0); + #pragma unroll + for (int i=1; i < Unroll-1; i++) { + if (r+i < nRanks) acc = applyReduce(red, acc, applyCast(got[i])); + } + } + return applyCast(acc); + } + + template + __device__ T recvLL(int slot) { + T one[1]; + this->template recvLL<1, 1, T>(slot, 1, 0, one); + return one[0]; + } + + template + __device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) { + int me = coop.self(); + if (me < nSlots) { + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me; + uint4 got[divUp(sizeof(T), 8)]; + //int spins=0; + #pragma unroll 1 + while (true) { + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T)))); + } + bool ok = true; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + ok &= got[u].y == llEpoch; + ok &= got[u].w == llEpoch; + } + if (__builtin_expect(ok, true)) break; + //if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); } + } + union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + u32[u][0] = got[u].x; + u32[u][1] = got[u].z; + } + dst[slot0 + me] = val; + } + } +}; +} + +template typename Red, typename T, bool nvls> +struct ncclSymAccumType { using Type = T; }; + +// Only Red's whose opArg is invariant w.r.t. the datatype can have a different +// accumulator type. At the moment this excludes integer min/max, sumpostdiv, +// and premulsum. 
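+// FuncSum takes no per-datatype scalar argument, so the low-precision sums below can
+// safely accumulate in float: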
+template<> struct ncclSymAccumType { using Type = float; }; +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> struct ncclSymAccumType { using Type = float; }; +#endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymAccumType { using Type = float; }; +#endif +#endif diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh new file mode 100644 index 000000000..4fd96093e --- /dev/null +++ b/src/device/symmetric/reduce_scatter.cuh @@ -0,0 +1,387 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ void reduceDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, char* inputRank0, char* outputHere, int32_t nIters + ) { + using Pack = BytePack; + using Acc = typename Red::EltType; + using AccPack = BytePack; + + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack acc0[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + AccPack acc1[UnrollPacks]; + int r = rank+1; + if (r == nRanks) r = 0; + { Pack tmp1[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, applyCast(acc0[u]), applyCast(tmp1[u])); + } + } + + r += 1; + if (r == nRanks) r = 0; + + int dr = 2; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + Pack tmp1[UnrollPeers][UnrollPacks]; + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + r += 1; + if (r == nRanks) r = 0; + } + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, acc1[u], applyCast(tmp1[ur][u])); + } + } + } + } + + #pragma unroll + for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); + + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u]; + + inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. 
+ #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ void reduceEnds( + ncclSymPrims& prim, int tn, int t, Red red, + T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + using Acc = typename Red::EltType; + + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpRank0 = (BytePack*)inputRank0; + BytePack* outHere = (BytePack*)outputHere; + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; + BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc1; + BytePack tmp[UnrollPeers]; + int dr = 1; + int r = rank+1; + if (nRanks == r) r = 0; + bool first = true; + + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j = 0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? j++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + tmp[u] = *add4G(inpRank0+elt, r*stride4G); + r += 1; + if (r == nRanks) r = 0; + } + if (first) { + first = false; + acc1 = applyCast(acc0); + } + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + acc1 = applyReduce(red, acc1, applyCast(tmp[u])); + } + } + } + + acc0 = applyCast(acc1); + outHere[elt] = acc0; + } +} + +template +static __device__ void reduce( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, T* input, T* output, size_t nElts + ) { + int nRanks = prim.nRanks; + int nBlocks = prim.nBlocks; + // Mpve input to rank=0 + input = prim.peerPtr(0, input); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + uint32_t alignment = uint32_t(inputUptr - outputUptr); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16u - inputUptr)%16u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if (alignment%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + reduceDeep( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && alignment%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + reduceDeep<(sizeof(T) <= BytePerPack ? 
BytePerPack : 0), UnrollPacks, UnrollPeers, T>( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + reduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); +} + + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + Red::Type> red(args->redOpArg); + + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void reduceMultimem( + ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + ) { + // Mpve input to multimem + input = prim.multimemPtr(input); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + constexpr int BytePerPack = LoadMultimem_BigPackSize::BigPackSize; + uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if (sizeof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) { + constexpr int UnrollPacks = 8*(16/BytePerPack); + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + uint32_t nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = applyLoadMultimem(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + *reinterpret_cast*>(outputUptr + cursor + u*WARP_SIZE*BytePerPack) = tmp[u]; + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = applyLoadMultimem(red, inputUptr + cursor); + *reinterpret_cast*>(outputUptr + cursor) = val; + cursor += tn*sizeof(T); + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + Red::Type> red(args->redOpArg); + + // Round robin warps over blocks. 
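+  // t = lane + WARP_SIZE*(block + nBlocks*warp); there is no rank dimension because each
+  // rank only produces its own output chunk and the multimem load performs the
+  // cross-rank reduction.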
+ int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +// T is user type, EltType is the most aligned type +template +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( + ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(EltType); + + int nRanks = prim.nRanks; + int rank = prim.rank; + int t = threadIdx.x; + int tn = ncclSymMaxThreads; + ncclCoopCta cta; + + #pragma unroll 1 + while (0 < nElts) { + int nIterPacks = min(nPacks, tn); + int tn_div_nPacks = tn/nIterPacks; + int tn_mod_nPacks = tn%nIterPacks; + int peer = t/nIterPacks; + int pack = t%nIterPacks; + + #pragma unroll 1 + for (int i = t; i < nRanks*nIterPacks; i += tn) { + Pack got = loadPack(input + peer*nStrideElts, pack*EltPerPack, nElts); + prim.sendLL(peer, rank*nIterPacks + pack, got); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + + if (t < nIterPacks) { + Pack got = prim.template recvReduceLL(t, nIterPacks, red); + storePack(output, t*EltPerPack, nElts, got); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } +} +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL); + Red::Type> red(args->redOpArg); + + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(T); + int nAllElts = args->nElts; + int nAllPacks = divUp(nAllElts, EltPerPack); + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32); + int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); + int nPacks = blockPackEnd - blockPackBegin; + int nElts = nAllElts - blockPackBegin*EltPerPack; + nElts = min(nElts, nPacks*EltPerPack); + T* input = (T*)args->input + blockPackBegin*EltPerPack; + T* output = (T*)args->output + blockPackBegin*EltPerPack; + + uint32_t lowBits = args->nElts*sizeof(T); + lowBits |= (uint32_t)reinterpret_cast(args->input); + lowBits |= (uint32_t)reinterpret_cast(args->output); + if (__builtin_expect(lowBits%8 == 0, true)) { + ncclSymRun_ReduceScatter_LL_body(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack); + } else { + ncclSymRun_ReduceScatter_LL_body(prim, red, input, output, nElts, nPacks, nAllElts); + } +} diff --git a/src/enqueue.cc b/src/enqueue.cc index 4e8a211fc..f5b43724c 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -13,6 +13,7 @@ #include "cudawrap.h" #include "profiler.h" #include "transport.h" +#include "register_inline.h" #include // std::memcpy #include // PRIx64 @@ -28,34 +29,41 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int carveout = ncclParamL1SharedMemoryCarveout(); int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); - for (int k=0; k < ncclDevKernelCount; k++) { - void* fn = ncclDevKernelList[k]; - cudaFuncAttributes attr = {0}; - if (fn == nullptr) continue; - - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); - if (maxStackSize) { - if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; - ignore0:; - } - if (carveout) { - CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributePreferredSharedMemoryCarveout, carveout), - result, ignore1); - ignore1:; - } - if (ncclMaxSharedMem != 0) { - int sharedMemSize = ncclMaxSharedMem; - if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { - WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", - cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); - return ncclSystemError; + for (int sym=0; sym <= 1; sym++) { + int kcount = sym==0 ? ncclDevKernelCount : ncclSymKernelCount; + void* const* kptrs = sym==0 ? 
ncclDevKernelList : ncclSymKernelList; + for (int k=0; k < kcount; k++) { + void* fn = kptrs[k]; + cudaFuncAttributes attr = {0}; + if (fn == nullptr) continue; + + cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); + if (errcode == cudaErrorNoKernelImageForDevice) continue; + CUDACHECKGOTO(errcode, result, ignore0); + + if (maxStackSize) { + if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; + ignore0:; } - CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), - result, next_kernel); + if (carveout) { + CUDACHECKGOTO(cudaFuncSetAttribute(fn, + cudaFuncAttributePreferredSharedMemoryCarveout, carveout), + result, ignore1); + ignore1:; + } + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + return ncclSystemError; + } + CUDACHECKGOTO(cudaFuncSetAttribute(fn, + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), + result, next_kernel); + } + next_kernel:; } - next_kernel:; } return result; } @@ -258,8 +266,8 @@ static bool testBudget( ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; + if (planner->isSymColl) return ncclSuccess; struct ncclTaskColl *task; - task = ncclIntruQueueHead(&planner->collTaskQueue); while (task != nullptr) { // Build a ncclDevWorkColl[Reg?] struct for each task. @@ -331,6 +339,38 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; int fnOpTyCount = 0; + if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) { + void* sendSymPtr; + void* recvSymPtr; + struct ncclReg* sendReg; + struct ncclReg* recvReg; + size_t size = task->count*ncclTypeSize(task->datatype); + NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg)); + NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg)); + bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype); + + if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) { + enum ncclSymKernelId kernel; + int nChannels, nWarps; + float estTimeUs = 1.e18; + NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps)); + + // We should only use symmetric kernel if it beats the asymmetric kernel. But the + // perf model accuracy from asymmetric kernels is too inaccurate and reports too high + // of a bandwidth. For now just always use symmetric if available. + if (kernel != ncclSymKernelId_Count) { + task->sendbuff = sendSymPtr; + task->recvbuff = recvSymPtr; + task->devFuncId = (int)kernel; + task->nMaxChannels = nChannels; + task->nWarps = nWarps; + ncclIntruQueueEnqueue(&planner->collTaskQueue, task); + planner->isSymColl = true; + return ncclSuccess; + } + } + } + // Walk the size sorted tasks, binning them by (fn,op,ty). while (task != nullptr) { struct ncclTaskColl* next = task->next; @@ -603,6 +643,10 @@ static ncclResult_t scheduleCollTasksToPlan( (countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count; nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 
1 : 0); + + // Update number of channels propagated to the profiler + task->nChannels = (uint8_t)nChannels; + // Ensure room for worst case of one new batch per channel if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { return ncclSuccess; @@ -860,6 +904,8 @@ static ncclResult_t addP2pToPlan( partSize = divUp(bytes[dir], nChannels[dir]); } } + // Update number of channels propagated to the profiler + if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir]; } struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); @@ -1052,47 +1098,17 @@ static ncclResult_t scheduleP2pTasksToPlan( } // Spin until its safe to increase comm->workFifoProduced to desiredProduced. -static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { - bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; - if (hasRoom) return; - while (true) { - // We have to poll for notifications from device. - uint32_t* consumedLive = comm->workFifoConsumed; - uint32_t consumed[MAXCHANNELS]; - for (int c=0; c < MAXCHANNELS; c++) { - consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED); - } - // Compiler-only fence to prevent fusion of loops to encourage dense loads. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - uint32_t produced = comm->workFifoProduced; - uint32_t consumedLeast = produced; - for (int c=0; c < MAXCHANNELS; c++) { - // consumedLeast is min over all non-quiesced channels - if (consumed[c] != comm->channels[c].workFifoProduced) { - if ((produced - consumedLeast) < (produced - consumed[c])) { - consumedLeast = consumed[c]; - } - } - } - - // Compiler only fence to prevent fusion of loops to encourage dense stores. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - for (int c=0; c < MAXCHANNELS; c++) { - // Advance counter on quiesced channels so they don't lag behind - // too far where they could get lost in 32-bit wraparound. 
- if (consumed[c] == comm->channels[c].workFifoProduced) { - comm->channels[c].workFifoProduced = consumedLeast; - __atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED); - } +static ncclResult_t waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { + bool hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes; + if (!hasRoom) { + while (true) { + NCCLCHECK(ncclCommPollEventCallbacks(comm, /*waitSome=*/true)); + hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes; + if (hasRoom) break; + sched_yield(); } - comm->workFifoConsumedLeast = consumedLeast; - - hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; - if (hasRoom) break; - sched_yield(); } + return ncclSuccess; } namespace { @@ -1106,11 +1122,14 @@ namespace { struct uploadWork_cleanup_t* me = (struct uploadWork_cleanup_t*)cb; free(me->hostBuf); CUDACHECK(cudaEventDestroy(me->base.event)); + free(me); return ncclSuccess; } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { + if (plan->isSymColl) return ncclSuccess; + size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); void* fifoBufHost; @@ -1127,7 +1146,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla fifoBufHost = comm->workFifoBuf; fifoCursor = comm->workFifoProduced; fifoMask = comm->workFifoBytes-1; - waitWorkFifoAvailable(comm, fifoCursor + workBytes); + NCCLCHECK(waitWorkFifoAvailable(comm, fifoCursor + workBytes)); plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: @@ -1208,7 +1227,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, /*waitSome=*/false), result, fail); finish_scope: if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode); @@ -1226,6 +1245,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* uint64_t collOpCount = comm->sharedRes->collOpCount; uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. + int hasp2p = 0; comm->sharedRes->collOpCount += plan->collOpCount; comm->collOpCount += plan->collOpCount; @@ -1244,6 +1264,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* // remember last value to compute max. p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId; + hasp2p = 1; } else { // coll op->opCount = (collOpCount<<1) + oldId; } @@ -1253,9 +1274,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* op = op->enqNext; } - for (int c=0; c < MAXCHANNELS; c++) { - // Advance channel's p2pOpCount by number of p2p's in this plan channel. - comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; + if (hasp2p) { + for (int c=0; c < MAXCHANNELS; c++) { + // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
+ comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; + } } return ncclSuccess; } @@ -1263,8 +1286,10 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { NCCLCHECK(ncclProfilerStartGroupEvent(plan)); NCCLCHECK(ncclProfilerStartTaskEvents(plan)); - NCCLCHECK(uploadProxyOps(comm, plan)); - NCCLCHECK(ncclProxyStart(comm)); + if (ncclIntruQueueHead(&plan->proxyOpQueue)) { + NCCLCHECK(uploadProxyOps(comm, plan)); + NCCLCHECK(ncclProxyStart(comm)); + } NCCLCHECK(ncclProfilerStopTaskEvents(plan)); NCCLCHECK(ncclProfilerStopGroupEvent(plan)); if (!plan->persistent) { @@ -1281,7 +1306,6 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } @@ -1357,9 +1381,8 @@ namespace { static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { if (ncclParamLaunchOrderImplicit()) { - // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs - if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } *mode = 12030 <= std::min(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; return ncclSuccess; } @@ -1386,26 +1409,51 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; - struct ncclKernelPlanBudget budget; - budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); - // Non-persistent kernels fill up at most half of our fifo per kernel. - budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; - - // Drain coll tasks first. This is essential since we partition tasks based - // on the work budget and p2p work isn't collective. If we were to drain p2p - // first, the place where we cut the kernel could vary by rank which would - // cause the "shortest channel first" channel picker to have divergent results. - if (planner->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); - } - // And only drain p2p tasks once colls are depleted. 
- if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); - } - finishPlan(comm, plan); - if (plan->workBytes != 0) { + if (planner->isSymColl) { + plan->workStorageType = ncclDevWorkStorageTypeArgs; + + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + plan->isSymColl = true; + plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype); + plan->threadPerBlock = task->nWarps*WARP_SIZE; + plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels); + + plan->kernelArgsSize = sizeof(struct ncclSymDevArgs); + plan->kernelSymArgs = ncclMemoryStackAlloc(&comm->memScoped); + plan->kernelSymArgs->comm = comm->symDevComm; + plan->kernelSymArgs->rootRank = task->root; + plan->kernelSymArgs->redOpArg = task->opDev.scalarArg; + plan->kernelSymArgs->nElts = task->count; + plan->kernelSymArgs->input = (char*)task->sendbuff; + plan->kernelSymArgs->output = (char*)task->recvbuff; + + planner->nTasksColl -= 1; ncclIntruQueueEnqueue(&planner->planQueue, plan); + INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d", + ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock); nPlans += 1; + } else { + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); + // Non-persistent kernels fill up at most half of our fifo per kernel. + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + } + finishPlan(comm, plan); + if (plan->workBytes != 0) { + ncclIntruQueueEnqueue(&planner->planQueue, plan); + nPlans += 1; + } } } while (planner->nTasksColl + planner->nTasksP2p != 0); @@ -1428,6 +1476,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; + cudaError_t status = cudaSuccess; NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); if (implicitOrder != ncclImplicitOrderNone) { @@ -1439,7 +1488,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); } - if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { + if (!persistent && comm->sharedRes->persistentRefs) status = cudaEventQuery(comm->sharedRes->hostStream.serialEvent); + if (persistent || ncclCudaLaunchBlocking || status == cudaErrorNotReady) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. 
bool acquired = false; @@ -1450,7 +1500,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } @@ -1485,6 +1534,8 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif +NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); + ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1512,7 +1563,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[4] = {}; + CUlaunchAttribute launchAttrs[6] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1549,6 +1600,18 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs].value.launchCompletionEvent.flags = 0; attrs++; } + if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; + launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1; + attrs++; + } + #endif + #if CUDART_VERSION >= 13000 + if (compCap >= 90 && driverVersion >= 13000) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING; + launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable(); + attrs++; + } #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; @@ -1560,7 +1623,6 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif } else { @@ -1573,21 +1635,30 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) { - // We are not using the host stream for proxy ops and reclaimation submission. + if (!plan->isHostCbEnq) { + // we are not using the host stream for proxy ops and reclaimation submission, call + // hostStreamPlanTask directly NCCLCHECK(hostStreamPlanTask(comm, plan)); - } else { - // We are using the host stream for proxy ops and reclaimation submission. - // Only plans with proxy ops have a callback pushed by ncclLaunchPrepare. - // Since non-persistent plans also require reclaimation, we have to do it - // here. 
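The ncclLaunchKernel hunk above grows the launch-attribute array and conditionally appends programmatic stream serialization (symmetric collectives on sm90+, CUDA >= 12.3) and, on CUDA 13, the NVLink-utilization-centric scheduling attribute gated by NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE. A condensed sketch of feeding such attributes to cuLaunchKernelEx follows; it only uses attributes available in CUDA 12.x, and launchWithAttrs is an illustrative name, not part of the patch.

#include <cuda.h>

static CUresult launchWithAttrs(CUfunction fn, CUstream stream, void** kernelArgs,
                                unsigned gridX, unsigned blockX, unsigned clusterSize) {
  CUlaunchAttribute attrs[2];
  int nAttrs = 0;
  if (clusterSize) {
    // Cooperative Group Array size (sm90+).
    attrs[nAttrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[nAttrs].value.clusterDim.x = clusterSize;
    attrs[nAttrs].value.clusterDim.y = 1;
    attrs[nAttrs].value.clusterDim.z = 1;
    nAttrs++;
  }
  // Allow dependent work to start once the kernel signals readiness.
  attrs[nAttrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
  attrs[nAttrs].value.programmaticStreamSerializationAllowed = 1;
  nAttrs++;
  CUlaunchConfig cfg = {};
  cfg.gridDimX = gridX;  cfg.gridDimY = 1; cfg.gridDimZ = 1;
  cfg.blockDimX = blockX; cfg.blockDimY = 1; cfg.blockDimZ = 1;
  cfg.attrs = attrs;
  cfg.numAttrs = nAttrs;
  cfg.hStream = stream;
  return cuLaunchKernelEx(&cfg, fn, kernelArgs, /*extra=*/nullptr);
}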
- if (!plan->persistent && !plan->hasProxyOps) { - ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); - } } return ncclSuccess; } +namespace { + struct KernelFinishCallback { + struct ncclCommEventCallback base; + uint32_t workFifoConsumed; + }; + ncclResult_t KernelFinishCallback_fn( + struct ncclComm* comm, struct ncclCommEventCallback* cb + ) { + struct KernelFinishCallback* me = (struct KernelFinishCallback*)cb; + comm->workFifoConsumed = me->workFifoConsumed; + CUDACHECK(cudaEventDestroy(me->base.event)); + free(me); + return ncclSuccess; + } +} + ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; if (!ncclIntruQueueEmpty(&planner->planQueue)) { @@ -1597,7 +1668,21 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch cudaStream_t deviceStream, launchOrder; - CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent; + CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); + + if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) { + comm->workFifoProducedLastRecorded = comm->workFifoProduced; + struct KernelFinishCallback* cb; + NCCLCHECK(ncclCalloc(&cb, 1)); + cb->base.event = finishedEvent; + cb->base.fn = KernelFinishCallback_fn; + cb->workFifoConsumed = comm->workFifoProduced; + ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cb->base); + // We just stole scratchEvent so must create a new one. + CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming)); + } + // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); @@ -1606,13 +1691,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { // on launchStream as a fast-forward. When building CUDA graphs fast forwards should // be handled specially so as not to create graphs with a blowup in the number of edges. // So we could do this: - // CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // CUDACHECK(cudaStreamWaitEvent(deviceStream, finishedEvent, 0)); // But instead we do: - NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, finishedEvent)); // Each userStream[i] waits on userStream[0] for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); + CUDACHECK(cudaStreamWaitEvent(l->stream, finishedEvent, 0)); } bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; @@ -1623,7 +1708,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { // Incorporate launch event into per-device (context) launch order. NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). - CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? 
comm->sharedRes->launchEvent : finishedEvent)); // Release launchOrder as acquired in ncclLaunchPrepare() NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); } @@ -1645,7 +1730,7 @@ static inline ncclResult_t getCollNetSupport( if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { netOp = ncclSum; } - *collNetSupport = comm->collNetSupport; + *collNetSupport = comm->config.collnetEnable; switch (info->func) { case ncclFuncAllReduce: case ncclFuncReduce: @@ -1683,10 +1768,8 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue; // CollNetDirect is only supported for up to 8 local GPUs if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; - if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; + if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; - /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; @@ -1801,7 +1884,14 @@ static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ ) { - size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + size_t elementSize = ncclTypeSize(info->datatype); + size_t nBytes = elementSize * ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + struct ncclReg* regSendBuf = NULL; + struct ncclReg* regRecvBuf = NULL; + int regBuff; + bool isSendValid, isRecvValid; + size_t sendbuffSize = elementSize * ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize * ncclFuncRecvCount(info->func, comm->nRanks, info->count); info->algorithm = NCCL_ALGO_UNDEF; info->protocol = NCCL_PROTO_UNDEF; int nMaxChannels = 0; @@ -1809,20 +1899,42 @@ static ncclResult_t getAlgoInfo( initCollCostTable((float **)collCostTable); NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable)); if (comm->tuner != NULL) { - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - struct ncclReg* regSendBuf; - struct ncclReg* regRecvBuf; NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); - int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister())); + NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid)); + NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid)); + regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || 
(ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()); NCCLCHECK(comm->tuner->getCollInfo( comm->tunerContext, info->func, nBytes, numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, regBuff, &nMaxChannels)); + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + } else { + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + // NCCL_CTA_POLICY_EFFICIENCY requires user (non-symmetric) buffer registration (currently unsupported with MNNVL) + if (comm->config.CTAPolicy == NCCL_CTA_POLICY_EFFICIENCY && ncclGetEnv("NCCL_ALGO") == NULL && ncclGetEnv("NCCL_PROTO") == NULL && !comm->MNNVL) { + // make algorithm selection based on buffer registration + // there can be other specialized policies for algorithms and protocols pickup in the future + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); + NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid)); + NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid)); + regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()); + if (regBuff && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) { + if ((comm->nNodes > 1 && collNetSupport && nvlsSupport) || (comm->nNodes == 1 && nvlsSupport)) { + int recChannels; + NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels)); + if (recChannels <= info->nMaxChannels) { + info->algorithm = NCCL_ALGO_NVLS; + info->protocol = NCCL_PROTO_SIMPLE; + info->nMaxChannels = recChannels; + info->nWarps = comm->maxThreads[info->algorithm][info->protocol] / WARP_SIZE; + } + } + } + } } - NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + info->nMaxChannels = nMaxChannels == 0 ? info->nMaxChannels : nMaxChannels; return ncclSuccess; } @@ -1892,16 +2004,20 @@ static ncclResult_t calcCollChunking( while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; } else if (info->algorithm == NCCL_ALGO_NVLS) { - int maxChunkSize = comm->nvlsChunkSize; - if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; - if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; - // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. - // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. 
- // coverity[overflow_before_widen] - uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; - if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; - if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; - if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; + if ((info->regBufType & NCCL_NVLS_REG_BUFFER) && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) { + chunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + } else { + int maxChunkSize = comm->nvlsChunkSize; + if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; + if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; + // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. + // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. + // coverity[overflow_before_widen] + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; + if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; + if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; + if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; + } } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. @@ -2045,7 +2161,7 @@ static ncclResult_t calcCollChunking( proxyOp->reg = 0; } - if (pattern == ncclPatternCollnetDirect) { + if (pattern == ncclPatternCollnetDirect || pattern == ncclPatternNvls) { proxyOp->specifics.collnetDirect.nNodes = comm->nNodes; proxyOp->specifics.collnetDirect.node = comm->node; if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { @@ -2168,7 +2284,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { bool isSendNotRecv = info->coll == ncclFuncSend; // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm); + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); p2p->func = info->coll; p2p->buff = (void*)info->recvbuff; @@ -2235,7 +2351,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm); + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); t->func = info->coll; t->sendbuff = info->sendbuff; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 76b508c2d..152739b0c 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -258,7 +258,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; + if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -330,7 +330,7 @@ int ncclMinNchannels() { if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { - WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); + INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; @@ -346,7 +346,7 @@ int ncclMaxNchannels() { maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { - WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); + INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; @@ -379,7 +379,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int nNodes = comm->nNodes; int nChannels = comm->nChannels; int minHeadNum = INT_MAX; - int shared = parent && parent->nvlsSupport && parent->config.splitShare; + int shared = parent && parent->nvlsSupport && parent->shareResources; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail); @@ -452,7 +452,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); // Setup CollNet - if (comm->collNetSupport == 1) { + if (comm->config.collnetEnable) { struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 998371247..bc5cc755e 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -214,7 +214,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE const char* str = ncclGetEnv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); - if (disable == 1) l = 0; + if (disable == 1) l = PATH_LOC; if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable); } } @@ -247,7 +247,18 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); -int ncclTopoUserP2pLevel = -1; +static int ncclTopoUserP2pLevel = -1; // Initially "uninitialized". When initialized but unset, changes to -2. + +// Gets the user-provided value of NCCL_P2P_LEVEL/NCCL_P2P_DISABLE. If the user did not provide any, the value +// of the "level" argument is left unchanged. 
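The helper documented by the comment above caches the user override in a file-local static, using -1 for "not read yet" and -2 for "read, but unset", so the environment is parsed at most once and the caller's default is preserved when nothing was set. A standalone sketch of the same pattern (parsing deliberately simplified; it ignores NCCL_P2P_DISABLE and the path-name keywords the real code accepts):

#include <stdlib.h>

static int userP2pLevel = -1;  // -1: not read yet, -2: read but unset

static void readUserP2pLevel(int* level) {
  if (userP2pLevel == -1) {
    const char* env = getenv("NCCL_P2P_LEVEL");
    userP2pLevel = env ? atoi(env) : -2;
  }
  if (userP2pLevel != -2) *level = userP2pLevel;  // otherwise leave the caller's default untouched
}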
+ncclResult_t ncclGetUserP2pLevel(int* level) { + if (ncclTopoUserP2pLevel == -1) + NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); + if (ncclTopoUserP2pLevel != -2) + *level = ncclTopoUserP2pLevel; + return ncclSuccess; +} + ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) { int mnnvl = 0; @@ -275,9 +286,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst // Get GPUs from topology int g1, g2; - NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1)); + NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/true)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; - if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) { + if (ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } @@ -302,15 +313,8 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS; // User override - if (ncclTopoUserP2pLevel == -1) - NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); - if (ncclTopoUserP2pLevel != -2) { - p2pLevel = ncclTopoUserP2pLevel; - goto compare; - } + NCCLCHECK(ncclGetUserP2pLevel(&p2pLevel)); - -compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; @@ -378,7 +382,8 @@ NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); +// On C2C platforms use GDRDMA on NICs which are connected to the CPUs +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 1); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { *gdrMode = ncclTopoGdrModeDisable; @@ -387,7 +392,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it @@ -423,29 +428,29 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n // In case of PXN, use the intermediate GPU distance instead int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank)); - NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g, /*showWarn=*/true)); gpu = system->nodes[GPU].nodes+g; distance = gpu->paths[NET][n].type; } - int c; - NCCLCHECK(ncclGetLocalCpu(system, g, &c)); - if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { - // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs - INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + if (ncclParamNetGdrC2c() && distance == PATH_P2C) { + INFO(NCCL_GRAPH | NCCL_NET, "GPU %d / HCA %lx connected via C2C link", rank, netId); distance = PATH_C2C; } if 
(distance > netGdrLevel) { - INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); + INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } // Force PCIe mapping if path goes through PCI on a C2C system + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; else *gdrMode = ncclTopoGdrModeDefault; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); + INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } @@ -480,7 +485,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; int g; struct ncclTopoSystem* system = comm->topo; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; @@ -506,8 +511,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank *net = 1; // First check the current GPU-to-GPU speed. int g1, g2; - if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess || - ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) { + if (ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/false) != ncclSuccess || + ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) != ncclSuccess) { return ncclSuccess; } @@ -533,7 +538,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* path = gpu->paths[NET]+n; if (path->type == PATH_PXN) { @@ -601,6 +606,8 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } +NCCL_PARAM(PxnC2c, "PXN_C2C", 0); + ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. @@ -659,6 +666,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + // update the GPU -> NIC path in the case of C2C + PHB + for (int n = 0; n < system->nodes[NET].count; n++) { + struct ncclTopoNode* netNode = system->nodes[NET].nodes + n; + for (int g = 0; g < system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes + g; + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (c == -1) continue; + if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { + gpuNode->paths[NET][n].type = PATH_P2C; + netNode->paths[GPU][g].type = PATH_P2C; + } + } + } // Update paths for NICs (no GPU Direct, PXN, ...) for (int n=0; nnodes[NET].count; n++) { @@ -674,15 +695,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. 
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI - peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink - NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC - gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU - // We can use that GPU as relay to communicate with that NIC. - // Only enabling it in the GPU->NIC direction for now to favor - // receiving locally and sending remotely (consistent with net.cc) - NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); + if (/* (1) is either connected to the NIC with PXB*/ + (peerNode->paths[NET][n].type <= PATH_PXB || + /* or with P2C and PxN over C2C is enabled */ + (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + /* and (2) is connected to us through NVLink */ + peerNode->paths[GPU][g].type <= PATH_NVL && + /* and (3) is on the same node as us */ + NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && + /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + // We can use that GPU as relay to communicate with that NIC. + // Only enabling it in the GPU->NIC direction for now to favor + // receiving locally and sending remotely (consistent with net.cc) + NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); } } if (gpu->paths[NET][n].type < PATH_PHB) { @@ -761,7 +787,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp int peer; struct ncclTopoSystem* system = comm->topo; struct ncclTopoLinkList* path = NULL; - if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) { + if (ncclTopoRankToIndex(system, peerRank, &peer, /*showWarn=*/false) == ncclSuccess) { // Same rank if (g == peer) { *nChannels = -1; diff --git a/src/graph/search.cc b/src/graph/search.cc index 15a01243f..9d8ad3ff8 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -137,6 +137,7 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? 
graph->typeIntra : graph->typeInter; + if (path->type >= PATH_DIS) return ncclSuccess; if (mult == 1 && (path->type > type)) return ncclSuccess; if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_TREE || @@ -328,8 +329,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo *g = i; return ncclSuccess; } - if (*g == -1) return ncclInternalError; - return ncclSuccess; + return ncclInternalError; } ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time); @@ -658,24 +658,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs - float maxBw = 0; - int minHops = 0xfffffff; - struct ncclTopoLinkList* paths = net->paths[GPU]; - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].bw > maxBw) { - maxBw = paths[g].bw; - minHops = paths[g].count; - } else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) { - minHops = paths[g].count; - } - } - if (maxBw >= bw) { - for (int i=0; inodes[GPU].count; i++) { - int g = (graph->nChannels+i)%system->nodes[GPU].count; - if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); - } - } + int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; + NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); + // if no GPUs are connected, skip this net + if (pathType == PATH_DIS) continue; + for (int g = 0; g < localGpuCount; ++g) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } } @@ -762,6 +750,7 @@ struct kvDict kvDictLinkType[] = { { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, + { "P2C", PATH_P2C }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -920,8 +909,8 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) -float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; -float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; +float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1060,13 +1049,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int maxIntra = system->nodes[NET].count > 0 ? 
tmpGraph.typeInter : maxTypeIntra; if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; - goto search; + if (tmpGraph.typeIntra < PATH_DIS) goto search; } tmpGraph.typeIntra = minTypeIntra; if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; - goto search; + if (tmpGraph.typeInter < PATH_DIS) goto search; } tmpGraph.typeInter = minTypeInter; @@ -1124,7 +1113,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) { - WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); + INFO(NCCL_GRAPH, "Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->bwIntra = graph->bwInter = 0.1; @@ -1248,7 +1237,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG } if (pxnLevel == 1) { int g, n; - NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g, /*showWarn=*/true)); NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; if (gpu->paths[NET][n].type <= PATH_PXN) { @@ -1260,7 +1249,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG // Check which local GPU corresponds to that NIC and see if we can use PXN. int n, g1, g2; NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); - NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1, /*showWarn=*/true)); NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9499f396d..9fe81bbcd 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -9,12 +9,10 @@ #include "topo.h" #include "comm.h" #include "nvmlwrap.h" -#include "net.h" #include "coll_net.h" #include "transport.h" #include #include -#include "xml.h" #include "cpuset.h" #include "bootstrap.h" @@ -22,8 +20,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -251,7 +249,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { pciSwitch->pci.device |= 0xffff; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. 
- s = 0; + s = -1; // Will be incremented to 0 in the next loop iteration continue; fail: free(subSwIds); @@ -404,7 +402,9 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s return ncclSuccess; } -struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +#define PCI_BRIDGE_DEVICE_CLASS "0x060400" + +struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, @@ -699,6 +699,7 @@ static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrNam if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); } return ncclSuccess; @@ -709,6 +710,7 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value); } return ncclSuccess; @@ -719,6 +721,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); } return ncclSuccess; @@ -799,6 +802,17 @@ typedef struct xmlNodeStack { } xmlNodeStack; +ncclResult_t ncclFindFirstPciParent(ncclXmlNode** parent) { + ncclXmlNode* newParent = *parent; + while (strcmp(newParent->name, "pci") != 0) { + newParent = newParent->parent; + if (newParent == nullptr) return ncclSuccess; + if (strcmp(newParent->name, "system") == 0) return ncclSuccess; + } + *parent = newParent; + return ncclSuccess; +} + // 1. Find the common parent xmlNode between the given set of nodes ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { // Track a stack of parents per-net node being merged @@ -897,6 +911,7 @@ ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXml } out: + ncclFindFirstPciParent(&common); *parent = common; free(parents); return ncclSuccess; @@ -960,13 +975,19 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par return ncclSuccess; } -ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, -struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, +struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); return ncclInternalError; } + // Don't make vNics of size 1 + if (vProps->ndevs == 1) { + TRACE(NCCL_GRAPH, "TOPO/NET : Skipping vNic of size 1"); + return ncclSuccess; + } + // Trigger the merge, then get the new device's properties int vDevIndex = 0; ncclResult_t ret = makeVDevice(&vDevIndex, vProps); @@ -976,11 +997,18 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (* return ret; } + // Mark original NICs as keep="0" in the topology + for (int i = 0; i < vProps->ndevs; i++) { + int dev = vProps->devs[i]; + struct ncclXmlNode* netNode = physNetNodes[dev]; + NCCLCHECK(xmlSetAttrInt(netNode, "keep", 0)); + } + INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { ncclResult_t ret = ncclSuccess; INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); char* ncStr; @@ -1018,8 +1046,7 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char goto fail; } - struct ncclXmlNode* netNode; - ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); if (ret == ncclSuccess) { // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) for (int i = 0; i < vProps.ndevs; i++) { @@ -1041,7 +1068,7 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char goto exit; } -ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { // Compute the path type between each device int* paths = NULL; ncclResult_t res = ncclSuccess; @@ -1085,8 +1112,7 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe return ncclInternalError; } - struct ncclXmlNode* netNode; - ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); // Merging failed. 
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS) @@ -1118,6 +1144,7 @@ struct kvDict nicPathKvList[] = { { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, + { "P2C", PATH_P2C }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1139,14 +1166,19 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper if (path == PATH_LOC) { *parent = NULL; } else if (parent && strcmp((*parent)->name, "pci") == 0) { - // If the common parent is PCI, we must reparent the new NIC under a made up busId - NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist + const char* c; + NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); + if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } } TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); return ncclSuccess; } -ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { +ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { int* placedDevs = NULL; struct ncclXmlNode** physNetNodes = NULL; if (physicalDevs == 0) return ncclSuccess; @@ -1170,15 +1202,15 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ { // Avoids warnings related to jumping to "out" const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE"); NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); memset(placedDevs, 0, sizeof(int)*physicalDevs); if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); } } - NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); out: free(physNetNodes); @@ -1187,7 +1219,7 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ return res; } -static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) { +static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) { for (int n = startIndex; n < endIndex; n++) { ncclNetProperties_t props; NCCLCHECK(getProperties(n, &props)); @@ -1206,15 +1238,17 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star const char* colAttr; NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); - // If 
coll == 0 but the netNode is tagged as coll, don't update the keep value - if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); + NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); + int dev; + xmlGetAttrIntDefault(netNode, "dev", &dev, -1); + if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); // Only set coll if it's not 0 @@ -1230,30 +1264,22 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star return ncclSuccess; } -struct ncclTopoNetState { - int nVirtualNics; - int nPhysicalNics; - const char* name; -}; - // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. -static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) { +ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) { int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); // Enumerate physical devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0)); + NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport)); if (!usePhysicalDevices) { if (state->nVirtualNics == -1) { - NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics)); + NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics)); int nDevs; NCCLCHECK(devices(&nDevs)); state->nVirtualNics = nDevs - state->nPhysicalNics; } - // Remove keep=1 for physical collnets if (state->nVirtualNics > 0) { - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0)); // Populate new devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1)); + NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport)); } } @@ -1301,6 +1327,15 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Try default XML topology location 
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail); } + // Fixup the cpu's host_hashes. + struct ncclXmlNode* node; + // Update every cpu node's host_hash attribute since those are not + // intended to be preserved from the XML files that have been read. + NCCLCHECKGOTO(xmlFindTag(xml, "cpu", &node), ret, fail); + while (node != nullptr) { + NCCLCHECKGOTO(xmlSetAttrLong(node, "host_hash", getHostHash()), ret, fail); + NCCLCHECKGOTO(xmlFindNextTag(xml, "cpu", node, &node), ret, fail); + } if (xml->maxIndex == 0) { // Create top tag struct ncclXmlNode* top; @@ -1313,7 +1348,6 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Detect only the GPU managed by this process. We'll get any others through XML fusion. char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail); - struct ncclXmlNode* node; NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail); if (node) { NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail); @@ -1330,12 +1364,12 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy state = NULL; if (collNetSupport(comm)) { NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state, - comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state, + comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail); } NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state, + comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); pthread_mutex_unlock(&netLock); netLockHeld = 0; @@ -1399,7 +1433,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy goto exit; } -static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, +ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; @@ -1452,7 +1486,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; @@ -1517,7 +1551,7 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; int gpuIndex, cpuIndex; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex, /*showWarn=*/true)); NCCLCHECK(ncclGetLocalCpu(system, 
gpuIndex, &cpuIndex)); gpu = system->nodes[GPU].nodes+gpuIndex; cpu = system->nodes[CPU].nodes+cpuIndex; @@ -1529,8 +1563,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); - TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr))); } #endif @@ -1540,8 +1574,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr)); - TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr))); } #endif @@ -1558,8 +1592,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu // If there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr))); } return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 921a7f5d6..07ef5e105 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -9,6 +9,8 @@ #include "graph.h" #include "core.h" +#include "xml.h" +#include "net.h" #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 @@ -50,9 +52,10 @@ extern const char* topoNodeTypeStr[]; #define LINK_PCI 4 // Skipping 5 for PATH_PXB // Skipping 6 for PATH_PXN -// Skipping 7 for PATH_PHB -#define LINK_SYS 8 -#define LINK_NET 9 +// Skipping 7 for PATH_P2C +// Skipping 8 for PATH_PHB +#define LINK_SYS 9 +#define LINK_NET 10 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -76,20 +79,23 @@ extern const char* topoLinkTypeStr[]; // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
#define PATH_PXN 6 +// Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC +#define PATH_P2C 7 + // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 7 +#define PATH_PHB 8 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 8 +#define PATH_SYS 9 // Connection through the network -#define PATH_NET 9 +#define PATH_NET 10 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 10 +#define PATH_DIS 11 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -181,6 +187,13 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); +struct ncclTopoNetState { + int nVirtualNics; + int nPhysicalNics; + const char* name; +}; +ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport); + #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash); @@ -200,7 +213,7 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i return ncclInternalError; } -static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { +static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index, bool showWarn) { *index = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.rank == rank) { @@ -208,6 +221,7 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, return ncclSuccess; } } + if (showWarn) WARN("ncclTopoRankToIndex could not find rank %d", rank); return ncclInternalError; } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 68085b893..64dc5cf22 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -16,13 +16,13 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { int nt = env; if (nt > 0) { if (nt % WARP_SIZE != 0) { - WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); nt = max; } else if (nt > max) { - WARN("Invalid %s %d (maximum %d).", name, nt, max); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (maximum %d).", name, nt, max); nt = max; } else if (nt < min) { - WARN("Invalid %s %d (minimum %d).", name, nt, min); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (minimum %d).", name, nt, min); nt = min; } } else { @@ -51,11 +51,14 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { // NCCL_PROTO="^LL128;allreduce:LL128" // Enable everything but LL128, but only LL128 for allreduce. 
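Because the topo.h hunk above renumbers every constant after PATH_PXN to make room for PATH_P2C, the resulting order is easy to lose track of. Purely as a consolidated view (the tree itself keeps plain #defines; the comments restate the definitions from this patch and from the existing header):

enum ncclPathTypeAfterPatch {
  PATH_LOC = 0,   // local (myself)
  PATH_NVL = 1,   // direct NVLink
  PATH_NVB = 2,   // NVLink through an intermediate GPU
  PATH_C2C = 3,   // C2C link to the CPU
  PATH_PIX = 4,   // at most a single PCIe bridge
  PATH_PXB = 5,   // multiple PCIe bridges, no host bridge
  PATH_PXN = 6,   // PCI + NVLink relay through an intermediate GPU
  PATH_P2C = 7,   // new: C2C to the CPU, then PCIe to the NIC
  PATH_PHB = 8,   // PCIe plus a host bridge (typically the CPU)
  PATH_SYS = 9,   // PCIe plus the SMP interconnect between NUMA nodes
  PATH_NET = 10,  // through the network
  PATH_DIS = 11   // disconnected
};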
ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) { + ncclResult_t ret = ncclSuccess; char* fullStr = strdup(str); char* tmpFullStr; char* fullToken = strtok_r(fullStr, ";", &tmpFullStr); + char* subToken = nullptr; + char* tokStr = nullptr; while (fullToken) { - char* subToken = strdup(fullToken); + subToken = strdup(fullToken); char* tmpSubStr; char* prefix = strtok_r(subToken, ":", &tmpSubStr); char* elemList = strtok_r(NULL, ":", &tmpSubStr); @@ -65,7 +68,8 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes // because then all the prefixes before the prefix-less entry would be // overwritten. WARN("All entries except the first must have a prefix: \"%s\"", str); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } elemList = prefix; prefix = NULL; @@ -84,7 +88,7 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes foundPrefix = true; for (int e=0; ebwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = @@ -248,7 +264,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; - if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + if (a == NCCL_ALGO_NVLS) { + if (coll == ncclFuncAllReduce) { + bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + } else { + // allgather and reducescatter + bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f); + } + } if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); float busBw = graphs[a]->nChannels * bw; @@ -264,19 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { - busBw = ppn * bw; - // AllGather/ReduceScatter requires 1:1 GPU:NIC - int nicPerNode = comm->collNetHeadsNum; - if (coll == ncclFuncAllGather && comm->nNodes > 1) { - if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0; - } - if (coll == ncclFuncReduceScatter && comm->nNodes > 1) { - if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0; - } - // Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly - // interpolate the two. 
- float w = (ppn-1)/(8-1); - busBw *= w*0.85 + (1-w)*0.95; + busBw = ppn * std::min(graphs[a]->bwIntra, graphs[a]->bwInter * 0.9f); } else { // Collnet+Direct requires all GPUs to have a local NIC to work at full speed float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio @@ -285,6 +296,26 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (minCompCap >= 90) busBw *= .85; } } + // disable collnet for allgather/reducescatter if #localranks > #heads + // AllGather/ReduceScatter requires 1:1 GPU:NIC + if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_COLLNET_DIRECT) && p == NCCL_PROTO_SIMPLE && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) && comm->nNodes > 1) { + int nHeads = 0; + if (coll == ncclFuncAllGather && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->iallgather)) busBw = 0.0f; + if (coll == ncclFuncReduceScatter && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter)) busBw = 0.0f; + if (comm->config.collnetEnable) + nHeads = comm->collNetHeadsNum; + else + busBw = 0.0f; + if (busBw > 0.0f) { + for (int r = 0; r < comm->nRanks; r++) { + int node = comm->rankToNode[r]; + if (comm->nodeRanks[node].localRanks > nHeads) { + busBw = 0.0f; + break; + } + } + } + } // Convert bus BW to algorithm BW if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { @@ -411,7 +442,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Disable NVLS Tree on a single node if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1; // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported. - if (comm->collNetSupport == 0 && + if (comm->config.collnetEnable == 0 && (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN || (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1; @@ -426,17 +457,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (pEnable == 2 && p == NCCL_PROTO_LL128) { // Enable LL128 by default only on Volta/Ampere/Hopper/Blackwell+NVLink. Other cases are not tested and may cause silent data corruption. pEnable = 1; - pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); + pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? 
PATH_P2C : PATH_PXN))); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); - switch (minCompCap) { - case 70: pEnable &= 1; break; - case 80: pEnable &= 1; break; - case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; - case 100: pEnable &= 1; break; - case 120: pEnable &= 1; break; - default: pEnable &= 0; break; - } + pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2)); } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0; @@ -483,7 +507,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } } } - + // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; diff --git a/src/graph/xml.cc b/src/graph/xml.cc index a41289389..96b0c9a7c 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -39,7 +39,13 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { #if INT_OK int o = 0; do { - value[o++] = c; + value[o] = c; + if (o == MAX_STR_LEN-1) { + value[o] = '\0'; + WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN); + return ncclInternalError; + } + o++; NCCLCHECK(xmlGetChar(file, &c)); } while (c >= '0' && c <= '9'); value[o] = '\0'; @@ -51,10 +57,17 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { #endif } int o = 0; + char quote = c; // Remember which quote type we started with do { NCCLCHECK(xmlGetChar(file, &c)); - value[o++] = c; - } while (c != '"'); + value[o] = c; + if (o == MAX_STR_LEN-1) { + value[o] = '\0'; + WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN); + return ncclInternalError; + } + o++; + } while (c != quote); value[o-1] = '\0'; NCCLCHECK(xmlGetChar(file, last)); return ncclSuccess; @@ -267,7 +280,7 @@ ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { FILE* file = fopen(xmlTopoFile, "w"); if (file == NULL) { - WARN("Unable to open %s, not dumping topology.", xmlTopoFile); + INFO(NCCL_GRAPH|NCCL_ENV, "Unable to open %s, not dumping topology.", xmlTopoFile); return ncclSuccess; } NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); @@ -375,7 +388,7 @@ ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml FILE* file = fopen(xmlTopoFile, "r"); if (file == NULL) { if (warn) { - WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); + INFO(NCCL_GRAPH|NCCL_ENV, "Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); } return ncclSuccess; } @@ -759,7 +772,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18; if (maxNvLinks > 0 && nvmlDev == NULL) { - WARN("No NVML device handle. Skipping nvlink detection."); + INFO(NCCL_GRAPH, "No NVML device handle. Skipping nvlink detection."); maxNvLinks = 0; } @@ -961,8 +974,16 @@ ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k)); *keep += k; } - if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them. 
- (strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) { + // Remove node if it has no children and no keep attribute + if (*keep == 0 && // Trim PCI switches, CPUs with no used GPU/NIC under them, or pruned NICs + (strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0 || strcmp(node->name, "nic") == 0 || strcmp(node->name, "net") == 0)) { +#ifdef ENABLE_TRACE + const char* name; + const char* busid; + NCCLCHECK(xmlGetAttr(node, "name", &name)); + NCCLCHECK(xmlGetAttr(node, "busid", &busid)); + TRACE(NCCL_GRAPH, "Removing node %s %s %s\n", node->name, name, busid); +#endif NCCLCHECK(xmlRemoveNode(node)); } } diff --git a/src/graph/xml.h b/src/graph/xml.h index f06c0e68b..ad9f0faff 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -117,6 +117,13 @@ static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* a return ncclSuccess; } +static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrName, uint64_t* value) { + const char* str; + NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); + *value = strtoull(str, NULL, 0); + return ncclSuccess; +} + static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); @@ -124,7 +131,6 @@ static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrNam return ncclSuccess; } - static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); @@ -254,7 +260,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } @@ -267,7 +272,6 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } @@ -280,7 +284,6 @@ static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrNam node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%#lx", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } diff --git a/src/group.cc b/src/group.cc index c48c0de88..08ac54e9e 100644 --- a/src/group.cc +++ b/src/group.cc @@ -12,16 +12,14 @@ #include #include "bootstrap.h" +#define GROUP_MAX_RECLAIM_STEPS 10 + __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; -__thread struct ncclComm* ncclGroupCommHead = nullptr; +__thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum] = {nullptr}; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; -__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; -__thread struct ncclGroupJob ncclGroupJobMain; __thread int ncclGroupBlocking = -1; /* default mode */ -__thread bool ncclGroupJobAbortFlag = false; - void* ncclAsyncJobMain(void* arg); ncclResult_t ncclAsyncLaunch( @@ -191,6 +189,66 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { goto exit; } +struct ncclGroupSymmetricJob { + struct ncclAsyncJob base; + struct ncclComm* comm; +}; + +NCCL_PARAM(WinStride, "WIN_STRIDE", -1); + +ncclResult_t ncclCommGroupRegisterSymmetric(struct 
ncclAsyncJob* job_) { + struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + if (comm->baseStride == 0) { + cudaStream_t hostStream; + // first time to allocate symmetric VA space. + // calling into this function means symmetric is supported. + struct ncclSymDevBase* symBase = NULL; + size_t size = ncclSymDevBase::size(comm->localRanks); + if (ncclParamWinStride() != -1) { + comm->baseStride = ncclParamWinStride(); + } else { + size_t maxStride = 0; + for (int r = 0; r < comm->nRanks; ++r) + if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem; + comm->baseStride = maxStride; + } + INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30); + NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail); + comm->symAllocHead = 0; + + // Allocate symmetric memory for NCCL internal usage + NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail); + assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride)); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail); + CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); + + comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride); + comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr; + comm->symDevComm.nRanks = comm->localRanks; + comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks); + comm->symDevComm.rank = comm->localRank; + comm->symDevComm.stride4G = comm->baseStride >> 32; + } + + while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) { + struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue); + NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail); + free(task); + } + +exit: + return ret; +fail: + goto exit; +} + static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueHead = head; @@ -207,7 +265,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); cliqueNextHead = comm; @@ -224,7 +282,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { bool moreRounds = false; comm = cliqueHead; do { // Iterate clique members. - struct ncclComm* next = comm->groupNext; + struct ncclComm* next = comm->groupNext[ncclGroupTaskTypeCollective]; if (useBarrier) { // Barrier reduction result tells us if this was the final round. 
moreRounds = 0 != ncclCommIntraBarrierOut(comm); @@ -259,64 +317,60 @@ static ncclResult_t doLaunches(struct ncclComm* head) { return result; } -static inline void groupResetJobState(struct ncclGroupJob* job) { - if (job) { - if (job->groupBlockingPtr) *job->groupBlockingPtr = -1; - if (job->abortFlagPtr) *job->abortFlagPtr = false; - if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess; - if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL; - if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL; - memset(job, 0, sizeof(struct ncclGroupJob)); - } +static inline void groupLocalResetJobState() { + ncclGroupError = ncclSuccess; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) ncclGroupCommHead[type] = NULL; + ncclGroupCommPreconnectHead = NULL; + ncclGroupBlocking = -1; + ncclIntruQueueConstruct(&ncclAsyncJobs); return; } -static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) { - struct ncclComm* comm = *groupCommHeadPtr; - - /* reset all thread local variables */ - *groupCommHeadPtr = NULL; - *groupCommPreconnectHeadPtr = NULL; - *groupErrorPtr = ncclSuccess; - *groupBlockingPtr = -1; - *groupJobAbortFlagPtr = false; - - while (comm != nullptr) { - struct ncclComm* next = comm->groupNext; - (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext - // We don't know if preconnect succeeded or happened at all, so clear - // the flags that let `taskAppend()` skip over checking if preconnect - // is needed. - comm->preconnectNext = reinterpret_cast(0x1); - for (int i = 0; i < comm->nRanks; i++) { - comm->connectSend[i] = 0UL; - comm->connectRecv[i] = 0UL; - } - // Reclaim abandoned kernel plan memory. Note ncclWork structs were already - // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); - // Persistent plans will be reclaimed via the callbackQueue when the - // graph drops its UserObject reference. - if (!plan->persistent) { - while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); +static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t error) { + struct ncclComm* comm; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + comm = groupCommHeadPtr[type]; + // reset groupCommHeadPtr[type] + groupCommHeadPtr[type] = nullptr; + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext[type]; + (void)ncclGroupCommLeave(comm, type); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + if (type == ncclGroupTaskTypeCollective) { + comm->preconnectNext = reinterpret_cast(0x1); + for (int i = 0; i < comm->nRanks; i++) { + comm->connectSend[i] = 0UL; + comm->connectRecv[i] = 0UL; + } + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. 
+ while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); + } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + } + } + + { // Reset comm->planner to empty. + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks * sizeof(comm->planner.peers[0])); } - ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } - } - { // Reset comm->planner to empty. - ncclKernelPlanner::Peer* tmp = comm->planner.peers; - memset(&comm->planner, 0, sizeof(comm->planner)); - comm->planner.peers = tmp; - if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); + if (!comm->config.blocking) + (void)ncclCommSetAsyncError(comm, error); + comm = next; } - - if (!comm->config.blocking) - (void) ncclCommSetAsyncError(comm, error); - comm = next; } /* reset everything */ @@ -393,11 +447,10 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuegroupCommHeadPtr; - struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; - struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; - - bool *groupAbortFlag = gjob->abortFlagPtr; + struct ncclComm **groupCommHeadMain = gjob->groupCommHead; + struct ncclComm *groupCommPreconnectHeadMain = gjob->groupCommPreconnectHead; + struct ncclIntruQueue *asyncJobsMain = &gjob->asyncJobs; + bool *groupAbortFlag = &gjob->abortFlag; if (!simInfo && groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; @@ -421,9 +474,41 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + // only loop through sym alloc and register tasks + for (int type = ncclGroupTaskTypeSymRegister; type <= ncclGroupTaskTypeSymRegister; ++type) { + if (groupCommHeadMain[type]) { + struct ncclComm* cliqueHead = groupCommHeadMain[type]; + struct ncclComm* comm = NULL; + struct ncclIntruQueue asyncSymJobs; + ncclIntruQueueConstruct(&asyncSymJobs); + do { + comm = cliqueHead; + do { + struct ncclGroupSymmetricJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCommGroupRegisterSymmetric; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + ncclIntruQueueEnqueue(&asyncSymJobs, (struct ncclAsyncJob*)job); + comm = comm->groupNext[type]; + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); + NCCLCHECKGOTO(asyncJobLaunch(&asyncSymJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncSymJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncSymJobs); + if (job->destructor) job->destructor((void*)job); + } + cliqueHead = comm; + } while (cliqueHead != nullptr); + } + } + /* Connect channels at runtime if cumem is supported */ - if (groupCommHeadMain != nullptr) { - struct ncclComm* cliqueHead = 
groupCommHeadMain; + if (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr) { + struct ncclComm* cliqueHead = groupCommHeadMain[ncclGroupTaskTypeCollective]; struct ncclComm* comm = NULL; struct ncclIntruQueue asyncCollJobs; ncclIntruQueueConstruct(&asyncCollJobs); @@ -454,7 +539,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); } - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); // connect NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); @@ -466,42 +551,49 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf } while (cliqueHead != nullptr); // done with all buffer allocation, start registration and enqueue - comm = groupCommHeadMain; + comm = groupCommHeadMain[ncclGroupTaskTypeCollective]; do { CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail); - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm); } - if ((!simInfo) && (groupCommHeadMain != nullptr)) { - NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + if ((!simInfo) && (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr)) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain[ncclGroupTaskTypeCollective]), ret, fail); } while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); - if (!job->destroyFlag && job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking && groupCommHeadMain[ncclGroupTaskTypeCollective] == nullptr) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } - while (groupCommHeadMain != nullptr) { - struct ncclComm* comm = groupCommHeadMain; - struct ncclComm* next = comm->groupNext; - // Poll for callbacks sent to us from other threads. Typically these free - // resources from to our memory pools and UB - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); - (void) ncclGroupCommLeave(comm); - if (!comm->config.blocking) { - (void) ncclCommSetAsyncError(comm, ret); + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + while (groupCommHeadMain[type] != nullptr) { + struct ncclComm* comm = groupCommHeadMain[type]; + struct ncclComm* next = comm->groupNext[type]; + // Poll for callbacks sent to us from other threads. 
Typically these free + // resources from to our memory pools and UB + if (comm->reclaimSteps == GROUP_MAX_RECLAIM_STEPS) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); + comm->reclaimSteps = 0; + } else { + comm->reclaimSteps++; + } + (void)ncclGroupCommLeave(comm, type); + if (!comm->config.blocking) { + (void)ncclCommSetAsyncError(comm, ret); + } + groupCommHeadMain[type] = next; } - groupCommHeadMain = next; } exit: return ret; fail: - groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret); + groupCleanup(gjob->groupCommHead, &gjob->asyncJobs, ret); goto exit; } @@ -514,6 +606,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER; ncclSimInfo_t* internalSimInfoPtr = NULL; size_t realSize = 0; + bool hasCommHead = false; + ncclGroupJob* groupJob = NULL; internalSimInfo.magic = 0; @@ -539,72 +633,108 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { internalSimInfoPtr = &internalSimInfo; } - if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { - ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; - ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; - ncclGroupJobMain.groupErrorPtr = &ncclGroupError; - ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; - ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; - ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking; - ncclGroupJobMain.initialized = true; - ncclGroupJobMainPtr = &ncclGroupJobMain; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + if (ncclGroupCommHead[type]) { + hasCommHead = true; + break; + } + } + + NCCLCHECKGOTO(ncclCalloc(&groupJob, 1), ret, fail); + ncclIntruQueueConstruct(&groupJob->asyncJobs); + groupJob->groupRefCount = 0; + groupJob->nonBlockingInit = false; + memcpy(groupJob->groupCommHead, ncclGroupCommHead, sizeof(ncclGroupCommHead)); + groupJob->groupCommPreconnectHead = ncclGroupCommPreconnectHead; + groupJob->groupError = ncclSuccess; + groupJob->abortFlag = false; + groupJob->joined = false; + ncclIntruQueueTransfer(&groupJob->asyncJobs, &ncclAsyncJobs); + + if (hasCommHead || !ncclIntruQueueEmpty(&groupJob->asyncJobs) || ncclGroupCommPreconnectHead != nullptr) { /* make sure ncclGroupBlocking has been set. */ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); if (ncclGroupBlocking == 0) { /* nonblocking group */ - if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + if (!ncclIntruQueueEmpty(&groupJob->asyncJobs)) { + ncclAsyncJob* job = ncclIntruQueueHead(&groupJob->asyncJobs); do { NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); - job->comm->groupJob = ncclGroupJobMainPtr; + if (job->comm->groupJob == NULL) { + job->comm->groupJob = groupJob; + groupJob->groupRefCount++; + } job = job->next; } while (job); } - if (ncclGroupCommHead) { - ncclComm_t comm = ncclGroupCommHead; - do { - NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); - /* link group job to communicators. 
*/ - comm->groupJob = ncclGroupJobMainPtr; - comm = comm->groupNext; - } while (comm); + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + if (ncclGroupCommHead[type]) { + ncclComm_t comm = ncclGroupCommHead[type]; + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); + /* link group job to communicators. */ + if (comm->groupJob == NULL) { + comm->groupJob = groupJob; + groupJob->groupRefCount++; + } + comm = comm->groupNext[type]; + } while (comm); + } } - ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; - PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail); + groupJob->base.func = groupLaunchNonBlocking; + PTHREADCHECKGOTO(pthread_create(&groupJob->base.thread, NULL, ncclAsyncJobMain, (void*)&groupJob->base), "pthread_create", ret, fail); + groupJob->nonBlockingInit = true; ret = ncclInProgress; } else { /* blocking group */ int savedDev; CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + NCCLCHECKGOTO(groupLaunch(&groupJob->base, internalSimInfoPtr), ret, fail); CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail); if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); - groupResetJobState(ncclGroupJobMainPtr); + free(groupJob); } } + /* Reset the job state for the next group call. */ + groupLocalResetJobState(); exit: return ret; fail: - groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret); + if (groupJob) { + groupCleanup(groupJob->groupCommHead, &groupJob->asyncJobs, ret); + free(groupJob); + } else { + groupCleanup(ncclGroupCommHead, &ncclAsyncJobs, ret); + } + groupLocalResetJobState(); goto exit; } ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { ncclResult_t ret = ncclSuccess; - if (groupJob && groupJob->initialized) { - ret = ncclAsyncJobComplete(&groupJob->base); - groupResetJobState(groupJob); + if (groupJob && groupJob->nonBlockingInit) { + if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) { + ret = ncclAsyncJobComplete(&groupJob->base); + } + if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) { + free(groupJob); + } } return ret; } ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { - if (groupJob && groupJob->initialized) { - __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE); - NCCLCHECK(ncclGroupJobComplete(groupJob)); + if (groupJob && groupJob->nonBlockingInit) { + if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) { + __atomic_store_n(&groupJob->abortFlag, true, __ATOMIC_RELAXED); + ncclAsyncJobComplete(&groupJob->base); + } + if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) { + free(groupJob); + } } return ncclSuccess; } diff --git a/src/include/allocator.h b/src/include/allocator.h new file mode 100644 index 000000000..189c3d4e2 --- /dev/null +++ b/src/include/allocator.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOCATOR_H_ +#define NCCL_ALLOCATOR_H_ + +ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr); +ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr); + +#endif diff --git a/src/include/bitops.h b/src/include/bitops.h index dcf0e2e09..71053ed49 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -19,6 +19,28 @@ #endif #endif +template +constexpr static __host__ __device__ Int minval(Int a) { return a; } +template +constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) { + #if __CUDA_ARCH__ + return minval(min(a, b), more...); + #else + return minval(a < b ? a : b, more...); + #endif +} + +template +constexpr static __host__ __device__ Int maxval(Int a) { return a; } +template +constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { + #if __CUDA_ARCH__ + return maxval(max(a, b), more...); + #else + return maxval(a > b ? a : b, more...); + #endif +} + #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) @@ -32,32 +54,150 @@ size = ((size + (align) - 1) / (align)) * (align); template -__host__ __device__ constexpr Z divUp(X x, Y y) { +static __host__ __device__ constexpr Z divUp(X x, Y y) { return (x+y-1)/y; } template -__host__ __device__ constexpr Z roundUp(X x, Y y) { +static __host__ __device__ constexpr Z roundUp(X x, Y y) { return (x+y-1) - (x+y-1)%y; } template -__host__ __device__ constexpr Z roundDown(X x, Y y) { +static __host__ __device__ constexpr Z roundDown(X x, Y y) { return x - x%y; } // assumes second argument is a power of 2 template -__host__ __device__ constexpr Z alignUp(X x, int a) { +static __host__ __device__ constexpr Z alignUp(X x, int a) { return (x + a-1) & Z(-a); } // assumes second argument is a power of 2 template -__host__ __device__ constexpr Z alignDown(X x, int a) { +static __host__ __device__ constexpr Z alignDown(X x, int a) { return x & Z(-a); } template -inline __host__ __device__ int countOneBits(Int x) { +constexpr __host__ __device__ bool isPow2(Int x) { + return (x & (x-1)) == 0; +} + +template +static __host__ __device__ T add4G(T base, int delta4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = base; + u32[1] += delta4G; + return tmp; +} + +template +static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = ptr; + u32[1] += delta4G; + if (u32[1] >= hi4G) u32[1] -= hi4G-lo4G; + return tmp; +} + +template +static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = ptr; + u32[1] -= delta4G; + if (u32[1] < lo4G) u32[1] += hi4G-lo4G; + return tmp; +} + +// Produce the reciprocal of x for use in idivByRcp +constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) { + return uint32_t(uint64_t(0x100000000)/x); +} +constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) { + return uint64_t(-1)/x + isPow2(x); +} + +static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ + return __umulhi(a, b); +#else + return uint64_t(a)*b >> 32; +#endif +} +static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) { +#if __CUDA_ARCH__ + return __umul64hi(a, b); +#else + return (uint64_t)(((unsigned __int128)a)*b >> 64); +#endif +} + +// Produce the reciprocal of x*y given their respective 
reciprocals. This incurs +// no integer division on device. +static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) { + if (xrcp == 0) return yrcp; + if (yrcp == 0) return xrcp; + uint32_t rcp = mul32hi(xrcp, yrcp); + uint32_t rem = -x*y*rcp; + if (x*y <= rem) rcp += 1; + return rcp; +} +static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) { + if (xrcp == 0) return yrcp; + if (yrcp == 0) return xrcp; + uint64_t rcp = mul64hi(xrcp, yrcp); + uint64_t rem = -x*y*rcp; + if (x*y <= rem) rcp += 1; + return rcp; +} + +// Fast integer division where divisor has precomputed reciprocal. +// idivFast(x, y, idivRcp(y)) == x/y +static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q = x, r = 0; + if (yrcp != 0) { + q = mul32hi(x, yrcp); + r = x - y*q; + if (r >= y) { q += 1; r -= y; } + } + *quo = q; + *rem = r; +} +static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q = x, r = 0; + if (yrcp != 0) { + q = mul64hi(x, yrcp); + r = x - y*q; + if (r >= y) { q += 1; r -= y; } + } + *quo = q; + *rem = r; +} + +static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q, r; + idivmodFast32(&q, &r, x, y, yrcp); + return q; +} +static __host__ __device__ uint32_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q, r; + idivmodFast64(&q, &r, x, y, yrcp); + return q; +} + +static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q, r; + idivmodFast32(&q, &r, x, y, yrcp); + return r; +} +static __host__ __device__ uint32_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q, r; + idivmodFast64(&q, &r, x, y, yrcp); + return r; +} + +template +static __host__ __device__ int countOneBits(Int x) { #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { return __popc((unsigned int)x); @@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) { // Returns index of first one bit or returns -1 if mask is zero. template -inline __host__ __device__ int firstOneBit(Int mask) { +static __host__ __device__ int firstOneBit(Int mask) { int i; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { @@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) { } template -inline __host__ __device__ int popFirstOneBit(Int* mask) { +static __host__ __device__ int popFirstOneBit(Int* mask) { Int tmp = *mask; *mask &= *mask-1; return firstOneBit(tmp); } template -inline __host__ __device__ int log2Down(Int x) { +static __host__ __device__ int log2Down(Int x) { int w, n; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { @@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) { } template -inline __host__ __device__ int log2Up(Int x) { +static __host__ __device__ int log2Up(Int x) { int w, n; if (x != 0) x -= 1; #if __CUDA_ARCH__ @@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) { } template -inline __host__ __device__ Int pow2Up(Int x) { +static __host__ __device__ Int pow2Up(Int x) { return Int(1)< -inline __host__ __device__ Int pow2Down(Int x) { +static __host__ __device__ Int pow2Down(Int x) { // True, log2Down can return -1, but we don't normally pass 0 as an argument... 
// coverity[negative_shift] return Int(1)< -inline __host__ UInt reverseSubBits(UInt x) { +static __host__ UInt reverseSubBits(UInt x) { if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { switch (8*sizeof(UInt)) { case 16: x = __builtin_bswap16(x); break; @@ -225,7 +365,7 @@ template<> struct ncclToUnsigned { using type = unsigned lon // Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. template -inline __host__ __device__ Int reverseBits(Int x, int nBits) { +static __host__ __device__ Int reverseBits(Int x, int nBits) { using UInt = typename ncclToUnsigned::type; union { UInt ux; Int sx; }; sx = x; @@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) { // has nearly the full range of uint32_t except it only keeps the top 3 bits // beneath the leading 1 bit and thus has a max value of 0xf0000000. -inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { +static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { int log2x; #if __CUDA_ARCH__ log2x = 31-__clz(x|1); @@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { return exponent<>bitsPerPow2; uint32_t mantissa = (x & ((1u< -inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { +static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { eatHash(acc, (const void*)bytes, sizeof(T)); } -inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { +static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { uint64_t h = acc[0]; h ^= h >> 31; h *= 0xbac3bd562846de6b; @@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { return h; } -inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { +static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { uint64_t acc[2] = {1, 1}; eatHash(acc, bytes, size); return digestHash(acc); } template -inline __host__ __device__ uint64_t getHash(const T* bytes) { +static __host__ __device__ uint64_t getHash(const T* bytes) { return getHash((const void*)bytes, sizeof(T)); } diff --git a/src/include/comm.h b/src/include/comm.h index 409518713..1378e0765 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -17,6 +17,7 @@ #include "register.h" #include "graph.h" #include "profiler.h" +#include "allocator.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -131,7 +132,6 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream int persistentRefs; cudaEvent_t launchEvent, scratchEvent; @@ -218,6 +218,7 @@ struct ncclTaskColl { // Profiler plugin int eActivationMask; void* eventHandle; + uint8_t nChannels; }; struct ncclTaskP2p { struct ncclTaskP2p* next; @@ -231,6 +232,7 @@ struct ncclTaskP2p { // Profiler plugin int eActivationMask; void* eventHandle; + uint8_t nChannels; }; struct ncclKernelPlan { @@ -243,10 +245,14 @@ struct ncclKernelPlan { bool persistent; // aka captured in a graph bool isHostCbEnq; + bool isSymColl; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; - void *kernelFn; - struct ncclDevKernelArgs* kernelArgs; + void* kernelFn; + union { + struct ncclDevKernelArgs* kernelArgs; + struct ncclSymDevArgs* kernelSymArgs; + }; size_t kernelArgsSize; uint64_t channelMask; // bitset of which channels are present bool hasProxyOps; // does any 
channel have a non-empty proxyOpQueue @@ -355,6 +361,7 @@ struct ncclKernelPlanner { struct Peer* peers/*[nRanks]*/; int nTasksColl, nTasksP2p; bool persistent; + bool isSymColl; // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; @@ -404,6 +411,12 @@ struct ncclKernelPlanner { #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28. +typedef enum ncclGroupTaskType { + ncclGroupTaskTypeCollective = 0, + ncclGroupTaskTypeSymRegister = 1, + ncclGroupTaskTypeNum = 2, +} ncclGroupTaskType_t; + struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; @@ -420,9 +433,10 @@ struct ncclComm { struct ncclTopoSystem* topo; struct ncclProxyConnector* gproxyConn; struct ncclIntruQueue legacyRegCleanupQueue; + bool peerInfoValid; - int netPluginLoaded; ncclNet_t* ncclNet; + int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; @@ -439,7 +453,6 @@ struct ncclComm { uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. - const char* commName; uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator @@ -515,6 +528,7 @@ struct ncclComm { // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + struct ncclSymDevComm symDevComm; uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 @@ -522,12 +536,10 @@ struct ncclComm { void* workFifoBufDev; void* workFifoBufGdrHandle; - // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory. - uint32_t* workFifoConsumed/*[MAXCHANNELS]*/; - // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS) - uint32_t workFifoConsumedLeast; // Monotonic number of bytes (mod 1<<32) sent to fifo. uint32_t workFifoProduced; + uint32_t workFifoProducedLastRecorded; + uint32_t workFifoConsumed; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) @@ -543,10 +555,8 @@ struct ncclComm { struct ncclProxyState* proxyState; int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet - int collNetSupport; bool isOneRPN; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; - bool intraNodeP2pSupport; int* collNetHeads; int collNetHeadsNum; int* collNetDenseToUserRank; @@ -568,7 +578,7 @@ struct ncclComm { // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. - struct ncclComm* groupNext; + struct ncclComm* groupNext[ncclGroupTaskTypeNum]; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int localPersistentRefs; // number of persistent plan-lists capturing this comm @@ -588,6 +598,7 @@ struct ncclComm { ncclUserRedOp *userRedOps; // Queue of things for the main thread to do + int reclaimSteps; struct ncclIntruQueueMpsc callbackQueue; ncclConfig_t config; @@ -600,6 +611,9 @@ struct ncclComm { // group job to support multi-thread FT struct ncclGroupJob *groupJob; + // Flag indicating if this communicator shares resources with parent or children + bool shareResources; + // Tuning plugin int tunerPluginLoaded; ncclTuner_t* tuner; @@ -613,9 +627,18 @@ struct ncclComm { // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; + bool isAllDirectP2p; + int symmetricSupport; bool useNetPXN; bool useGdr; int splitCount; + // symmetric buffer + uint8_t* baseUCSymPtr; + uint8_t* baseMCSymPtr; + size_t baseStride; + size_t symAllocHead; + CUmemGenericAllocationHandle symMCHandle; + struct ncclIntruQueue symRegTaskQueue; uint64_t endMagic; }; @@ -647,15 +670,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) return ncclSuccess; } -inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { +inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); while (true) { struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue); if (cb == nullptr) break; - cudaError_t ok = cudaEventSynchronize(cb->event); - if (ok == cudaErrorNotReady) break; + cudaError_t ok; + if (waitSome) { + ok = cudaEventSynchronize(cb->event); + waitSome = false; + } else { + ok = cudaEventQuery(cb->event); + if (ok == cudaErrorNotReady) break; + } ncclIntruQueueDequeue(&comm->eventCallbackQueue); if (ok == cudaSuccess) { NCCLCHECKGOTO(cb->fn(comm, cb), result, finish); diff --git a/src/include/cpuset.h b/src/include/cpuset.h index ec55cbc54..99e3edf4d 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { return ncclSuccess; } +static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) { + int c = 0; + int start = -1; + // Iterate through all possible CPU bits plus one extra position + for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) { + int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask); + // Start of a new range + if (isSet && start == -1) { + start = cpu; + } + // End of a range, add comma between ranges + if (!isSet && start != -1) { + if (cpu-1 == start) { + c += snprintf(str+c, len-c, "%s%d", c ? "," : "", start); + } else { + c += snprintf(str+c, len-c, "%s%d-%d", c ? 
"," : "", start, cpu-1); + } + if (c >= len-1) break; + start = -1; + } + } + if (c == 0) str[0] = '\0'; + return str; +} + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index bf6132657..2edc60f21 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType; } \ } while(false) +#define CUCALL(cmd) do { \ + pfn_##cmd; \ +} while(false) + #define CUCHECKGOTO(cmd, res, label) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ @@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType; } \ } while(0) -#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); -DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); +DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000); #if CUDART_VERSION >= 11080 -DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060); #endif // cuMem API support -DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); -DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); -DECLARE_CUDA_PFN_EXTERN(cuMemCreate); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity); -DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemMap); -DECLARE_CUDA_PFN_EXTERN(cuMemRelease); -DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess); -DECLARE_CUDA_PFN_EXTERN(cuMemUnmap); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020); #if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support 
+DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ -DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr); -DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate); -DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity); -DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind); +DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); #endif #endif diff --git a/src/include/device.h b/src/include/device.h index f6ca51b75..2c5ce1029 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "bitops.h" +#include "symmetric.h" #include #include #include @@ -29,6 +30,30 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_CUDA_ARCH 0 #endif +#ifdef __CUDA_ARCH_SPECIFIC__ + #define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__ +#elif defined(__CUDA_ARCH_HAS_FEATURE__) + #if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 900 + #elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1000 + #elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1010 + #elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1200 + #else + #define NCCL_CUDA_ARCH_SPECIFIC 0 + #endif +#else + #define NCCL_CUDA_ARCH_SPECIFIC 0 +#endif + +#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__ + #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__ +#else + #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0 +#endif + #include "net_device.h" enum ncclDevRedOp_t { @@ -380,6 +405,14 @@ struct alignas(16) ncclDevChannel { uint64_t workCounter; }; +#define MAX_PROFILER_EVENTS_PER_CHANNEL 64 +struct ncclDevProfiler { + struct { + uint64_t counter; + uint64_t timestamp; + } data[MAX_PROFILER_EVENTS_PER_CHANNEL]; +}; + struct ncclDevComm { int rank; int nRanks; @@ -389,9 +422,6 @@ struct ncclDevComm { int p2pChunkSize; int isAllNvlink; - // Work fifo return credits - uint32_t* workConsumed/*[MAXCHANNELS]*/; - int* collNetDenseToUserRank; // Flag to ask NCCL kernels to abort @@ -402,8 +432,8 @@ struct ncclDevComm { int* rankToLocalRank; // Profiler counters - uint64_t* workStarted/*[MAXCHANNELS]*/; - uint64_t* workCompleted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -476,7 +506,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4; + return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } @@ -507,7 +537,6 @@ extern int const ncclDevKernelCount; extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. 
-extern int const ncclDevFuncIdCount; extern int const ncclDevFuncRowToId[]; extern void* const ncclDevKernelForFunc[/*funcIndex*/]; extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; @@ -535,11 +564,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) { // `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { - #if defined(__CUDA_BF16_TYPES_EXIST__) constexpr int NumTypes = ncclNumTypes; - #else - constexpr int NumTypes = ncclNumTypes + 1; - #endif int row; do { row = 0; // ncclDevFuncIndex_P2p @@ -564,7 +589,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) } row += nAlgos*NCCL_NUM_PROTOCOLS; - nAlgos = 6; + nAlgos = 6; // TREE RING COLLNET_DIRECT COLLNET_CHAIN NVLS NVLS_TREE if (coll == ncclFuncAllReduce) { row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto; break; diff --git a/src/include/graph.h b/src/include/graph.h index a06556e37..7475e5a7b 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -50,6 +50,8 @@ int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu); +ncclResult_t ncclGetUserP2pLevel(int* level); + // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -74,7 +76,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); -#define NCCL_TOPO_MAX_NODES 256 +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_NODES 576 +ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType); // Init search. 
Needs to be done before calling ncclTopoCompute ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); diff --git a/src/include/group.h b/src/include/group.h index c06d1ef1b..033a187da 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -9,9 +9,11 @@ #include "nccl.h" #include "comm.h" +#include "allocator.h" +#include "register.h" ncclResult_t ncclGroupErrCheck(ncclResult_t ret); -void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommJoin(struct ncclComm* comm, int type); void ncclGroupCommPreconnect(struct ncclComm* comm); ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); @@ -52,13 +54,14 @@ ncclResult_t ncclAsyncLaunch( struct ncclGroupJob { struct ncclAsyncJob base; - struct ncclComm **groupCommHeadPtr; - struct ncclComm **groupCommPreconnectHeadPtr; - ncclResult_t *groupErrorPtr; - bool *abortFlagPtr; - int *groupBlockingPtr; - struct ncclIntruQueue *asyncJobsPtr; - bool initialized; + int groupRefCount; + bool nonBlockingInit; + bool joined; + struct ncclComm *groupCommHead[ncclGroupTaskTypeNum]; + struct ncclComm *groupCommPreconnectHead; + ncclResult_t groupError; + bool abortFlag; + struct ncclIntruQueue asyncJobs; }; ncclResult_t ncclGroupStartInternal(); @@ -69,27 +72,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting extern __thread ncclResult_t ncclGroupError; -extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum]; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; -extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; -extern __thread struct ncclGroupJob ncclGroupJobMain; - -static inline void groupResetJobState() { - ncclGroupBlocking = -1; - ncclGroupJobMainPtr = NULL; - memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); - return; -} - -static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { - ncclResult_t ret = ncclSuccess; - if (job) { - ret = ncclAsyncJobComplete(&job->base); - groupResetJobState(); - } - return ret; -} inline ncclResult_t ncclGroupStartInternal() { ncclGroupDepth++; @@ -104,31 +89,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { } // Add comm to this thread's group -inline void ncclGroupCommJoin(struct ncclComm* comm) { - if (comm->groupNext == reinterpret_cast(0x1)) { +inline void ncclGroupCommJoin(struct ncclComm* comm, int type) { + if (comm->groupNext[type] == reinterpret_cast(0x1)) { // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves // the users program order yet insures siblings occur consecutively. This // is required by doLaunches() in "group.cc". - struct ncclComm** pp = &ncclGroupCommHead; + struct ncclComm** pp = &ncclGroupCommHead[type]; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) - pp = &(*pp)->groupNext; + pp = &(*pp)->groupNext[type]; // didn't find its clique, we need to insert it with ascending order based on commHash if (*pp == nullptr) { - pp = &ncclGroupCommHead; - while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + pp = &ncclGroupCommHead[type]; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type]; } - comm->groupNext = *pp; + comm->groupNext[type] = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. 
Each task batched for // this comm is allocated there. ncclMemoryStackPush(&comm->memScoped); - // Initialize planner - ncclKernelPlanner::Peer* tmp = comm->planner.peers; - memset(&comm->planner, 0, sizeof(comm->planner)); - comm->planner.peers = tmp; + if (type == ncclGroupTaskTypeCollective) { + // Initialize planner + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + } } - ncclGroupBlocking = comm->config.blocking; } @@ -141,8 +127,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) { } // Comm has left group -inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { - comm->groupNext = reinterpret_cast(0x1); +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) { + comm->groupNext[type] = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); return ncclSuccess; } diff --git a/src/include/mlx5/mlx5dvcore.h b/src/include/mlx5/mlx5dvcore.h new file mode 100644 index 000000000..9ec40c039 --- /dev/null +++ b/src/include/mlx5/mlx5dvcore.h @@ -0,0 +1,18 @@ +#ifndef NCCL_MLX5DV_CORE_H_ +#define NCCL_MLX5DV_CORE_H_ + +/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without + * explicit including of MLX5 direct verbs header. + */ + +#include +#include +#include +#include +#include "ibvwrap.h" + +enum mlx5dv_reg_dmabuf_access { + MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0), +}; + +#endif // NCCL_MLX5DV_CORE_H_ diff --git a/src/include/mlx5/mlx5dvsymbols.h b/src/include/mlx5/mlx5dvsymbols.h new file mode 100644 index 000000000..fb08368e7 --- /dev/null +++ b/src/include/mlx5/mlx5dvsymbols.h @@ -0,0 +1,23 @@ +#ifndef NCCL_MLX5DV_SYMBOLS_H_ +#define NCCL_MLX5DV_SYMBOLS_H_ + +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif + +#include "nccl.h" + +/* MLX5 Direct Verbs Function Pointers*/ +struct ncclMlx5dvSymbols { + bool (*mlx5dv_internal_is_supported)(struct ibv_device *device); + int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len); + /* DMA-BUF support */ + struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); + }; + +/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */ +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols); + +#endif // NCCL_MLX5DV_SYMBOLS_H_ diff --git a/src/include/mlx5/mlx5dvwrap.h b/src/include/mlx5/mlx5dvwrap.h new file mode 100644 index 000000000..4f858f3c6 --- /dev/null +++ b/src/include/mlx5/mlx5dvwrap.h @@ -0,0 +1,41 @@ +/************************************************************************* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MLX5DVWRAP_H_ +#define NCCL_MLX5DVWRAP_H_ + +#include +#include +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif + +#include "core.h" +#include "ibvwrap.h" +#include +#include + +typedef enum mlx5dv_return_enum +{ + MLX5DV_SUCCESS = 0, //!< The operation was successful +} mlx5dv_return_t; + +ncclResult_t wrap_mlx5dv_symbols(void); +/* NCCL wrappers of MLX5 direct verbs functions */ +bool wrap_mlx5dv_is_supported(struct ibv_device *device); +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len); +/* DMA-BUF support */ +ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); +struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); + +#endif // NCCL_MLX5DVWRAP_H_ diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index fcf2251fe..0f387c15e 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,6 +7,8 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ +#include + typedef enum { NCCL_LOG_NONE = 0, NCCL_LOG_VERSION = 1, @@ -38,6 +40,16 @@ typedef enum { typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +// NCCL core profiler callback for network defined events instrumentation +enum { + ncclProfilerNetEventStart = 0, + ncclProfilerNetEventStop, + ncclProfilerNetEventUpdate, + ncclProfilerNetEventUpdateAndStop, +}; + +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); + #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast = 0, @@ -51,7 +63,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 diff --git a/src/include/net.h b/src/include/net.h index afc2d160e..552e9bcb4 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -14,8 +14,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 2c18b36b9..de50dfe2e 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,10 +31,11 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +#define NVTX_SID_CommShrink 15 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. 
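/* Editor's note: a hedged sketch of how a caller might drive the wrap_mlx5dv_* entry
 * points declared above to obtain a data-direct DMA-BUF memory registration. Only the
 * wrap_mlx5dv_* functions and MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT come from this patch;
 * the function name regDmaBufDataDirect, its parameters, and the particular IBV_ACCESS_*
 * flags are assumptions, and error unwinding is elided. */
static ncclResult_t regDmaBufDataDirect(struct ibv_device* dev, struct ibv_pd* pd,
                                        size_t size, int dmabufFd, struct ibv_mr** mrOut) {
  *mrOut = NULL;
  NCCLCHECK(wrap_mlx5dv_symbols());                        // resolve mlx5dv symbols (direct link or dlopen)
  if (!wrap_mlx5dv_is_supported(dev)) return ncclSuccess;  // leave *mrOut NULL when mlx5dv is unavailable
  int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  NCCLCHECK(wrap_mlx5dv_reg_dmabuf_mr(mrOut, pd, /*offset=*/0, size, /*iova=*/0, dmabufFd,
                                      access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT));
  return ncclSuccess;
}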
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 16 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h index 228a19275..89a41d4b5 100644 --- a/src/include/nvtx_payload_schemas.h +++ b/src/include/nvtx_payload_schemas.h @@ -67,6 +67,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, num_exclude, TYPE_INT, "num_exclude") + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h index d57aad5a9..18d1486d7 100644 --- a/src/include/plugin/nccl_net.h +++ b/src/include/plugin/nccl_net.h @@ -28,10 +28,9 @@ #define NCCL_NET_MAX_REQUESTS 32 // Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -// NCCL core profiler callback for network defined events instrumentation -typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); +#ifndef NCCL_NET_MAX_PLUGINS +#define NCCL_NET_MAX_PLUGINS 16 +#endif #include "net/net_v10.h" #include "net/net_v9.h" diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h index 34cf9a927..710aac4d5 100644 --- a/src/include/plugin/nccl_profiler.h +++ b/src/include/plugin/nccl_profiler.h @@ -19,43 +19,53 @@ enum { }; typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + 
ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined event states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; #include +#include "profiler/profiler_v4.h" #include "profiler/profiler_v3.h" #include "profiler/profiler_v2.h" #include "profiler/profiler_v1.h" -typedef ncclProfiler_v3_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v4_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; #define NCCL_PROFILER_NET_VER_BITS (16) #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) diff --git a/src/include/plugin/profiler/profiler_v4.h b/src/include/plugin/profiler/profiler_v4.h new file mode 100644 index 000000000..157d8ddd5 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v4.h @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active 
events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index bae0501bb..2fb6a7d38 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -21,8 +21,8 @@ struct ncclProxyConnector; struct ncclProfilerProxy { bool initialized; - uint64_t* workStarted/*[MAXCHANNELS]*/; - uint64_t* workCompleted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; uint64_t workCounter[MAXCHANNELS]; // host work counter struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; @@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); // Proxy Op Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args); -ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args); +ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args); ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); // Proxy Step Start/Stop Event Wrappers @@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); // Kernel Channel Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); -ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop); // Record Event Wrappers -ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); 
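/* Editor's note: a minimal sketch of a v4 profiler plugin wired against the
 * profiler_v4.h interface introduced above. It accepts every callback and does
 * nothing, which is enough to show the contract: init() publishes an activation
 * mask, startEvent() hands back an opaque handle per event descriptor, and
 * recordEventState()/stopEvent() later operate on that handle. It assumes the
 * NCCL profiler plugin headers are on the include path; the exported symbol
 * name ncclProfiler_v4 and the noop* helpers are assumptions for illustration. */
#include <stddef.h>

static ncclResult_t noopInit(void** context, int* eActivationMask, const char* commName,
                             uint64_t commHash, int nNodes, int nranks, int rank,
                             ncclDebugLogger_t logfn) {
  *context = NULL;       // no per-communicator state kept in this sketch
  *eActivationMask = 0;  // a real plugin would set bits here (e.g. for ncclProfileColl events)
  return ncclSuccess;
}
static ncclResult_t noopStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr) {
  *eHandle = NULL;       // no handle needed since every event is ignored
  return ncclSuccess;
}
static ncclResult_t noopStopEvent(void* eHandle) { return ncclSuccess; }
static ncclResult_t noopRecordEventState(void* eHandle, ncclProfilerEventState_v4_t eState,
                                         ncclProfilerEventStateArgs_v4_t* eStateArgs) {
  return ncclSuccess;
}
static ncclResult_t noopFinalize(void* context) { return ncclSuccess; }

ncclProfiler_v4_t ncclProfiler_v4 = {
  "NoopProfiler_v4",  // name
  noopInit, noopStartEvent, noopStopEvent, noopRecordEventState, noopFinalize
};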
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); diff --git a/src/include/proxy.h b/src/include/proxy.h index f90c80275..772aa206c 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -105,6 +105,13 @@ struct ncclProxyOp { struct ncclProxyOp *enqNext; }; +struct ncclProxySubArgs; + +struct ncclProxyEventHandle { + void* stepEventHandle; + struct ncclProxySubArgs* subArgPtr; +}; + struct ncclProxySubArgs { struct ncclProxyConnection* connection; int reg; @@ -137,13 +144,12 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; - uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; void* kernelEventHandle; - void* stepEventHandles[NCCL_STEPS]; + struct ncclProxyEventHandle pHandles[NCCL_STEPS]; size_t transSize; uint64_t workCounter; @@ -226,6 +232,8 @@ struct ncclProxyPeer { }; struct ncclSharedNetComms { + int activeConnect[MAXCHANNELS]; + int activeAccept[MAXCHANNELS]; void* sendComm[MAXCHANNELS]; void* recvComm[MAXCHANNELS]; int sendRefCount[MAXCHANNELS]; diff --git a/src/include/register.h b/src/include/register.h index 143f41bc9..231cbfc34 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -29,18 +29,24 @@ struct ncclRegNetHandles { struct ncclRegNetHandles* next; }; +struct ncclSymRegTask { + struct ncclSymRegTask *next; + void* buff; + size_t baseSize; + CUmemGenericAllocationHandle memHandle; + struct ncclReg* regHandle; + size_t alignment; +}; + struct ncclReg { // common attributes - size_t pages; + uintptr_t begAddr, endAddr; // page aligned int localRefs; int graphRefs; - uintptr_t addr; uint32_t state; // net reg struct ncclRegNetHandles* netHandleHead; // nvls reg - uintptr_t baseAddr; - size_t baseSize; CUdeviceptr regAddr; size_t regUCSize, regMCSize; int dev; @@ -52,6 +58,10 @@ struct ncclReg { // general ipc reg struct ncclPeerRegIpcAddr regIpcAddrs; struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; + // symmetric reg + void* baseSymPtr; + size_t symSize; + int winFlags; }; struct ncclRegCache { @@ -60,10 +70,14 @@ struct ncclRegCache { uintptr_t pageSize; }; +struct ncclWindow { + struct ncclReg* handle; +}; + ncclResult_t ncclRegCleanup(struct ncclComm* comm); -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); +ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle); #endif diff --git a/src/include/register_inline.h b/src/include/register_inline.h new file mode 100644 index 000000000..fb7641b13 --- /dev/null +++ b/src/include/register_inline.h @@ -0,0 +1,33 @@ +#ifndef NCCL_REGISTER_INLINE_H_ +#define NCCL_REGISTER_INLINE_H_ + +#include "comm.h" +#include "register.h" + +static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) { + struct ncclRegCache* cache = &comm->regCache; + *outReg = NULL; + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population) return ncclSuccess; + struct ncclReg *reg = cache->slots[slot]; + if ((uintptr_t)data < reg->begAddr) return ncclSuccess; + if ((uintptr_t)data + size <= 
reg->endAddr) { + *outReg = reg; + return ncclSuccess; + } + } +} + +static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) { + struct ncclReg* regRecord = NULL; + *symPtr = NULL; + *outReg = NULL; + NCCLCHECK(ncclRegFind(comm, data, size, ®Record)); + if (regRecord && regRecord->baseSymPtr) { + *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr); + *outReg = regRecord; + } + return ncclSuccess; +} + +#endif diff --git a/src/include/socket.h b/src/include/socket.h index ffa148091..adeae9b2a 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -69,8 +69,10 @@ struct ncclSocket { const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); +ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr, + union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found); +ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs, + int* nIfs); // Initialize a socket ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0); diff --git a/src/include/symmetric.h b/src/include/symmetric.h new file mode 100644 index 000000000..7a189bcca --- /dev/null +++ b/src/include/symmetric.h @@ -0,0 +1,90 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_H_ +#define NCCL_DEVICE_SYMMETRIC_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "bitops.h" + +constexpr int ncclSymMaxBlocks = 64; +constexpr int ncclSymMaxThreads = 512; +constexpr int ncclSymLLMaxEltSize = 64; + +constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) { + return ncclSymMaxThreads*ncclSymLLMaxEltSize/eltSize; +} + +constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) { + return /*LL Overhead*/2 * maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize); +} + +struct alignas(16) ncclSymDevBase { + uint32_t llEpoch[ncclSymMaxBlocks]; + uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks]; + uint32_t barInboxMc[ncclSymMaxBlocks]; + uint32_t barInboxPerPeer[]; + + static constexpr size_t size(int nRanks) { + return sizeof(ncclSymDevBase) + + alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) + + ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks); + } +}; + +static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) { + // Get pointer to buffer trailing the header struct. 
+ char* ans = (char*)(base + 1); + // Skip over barInboxPerPeer[] + ans += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16); + // Skip to our block + int epochSize = ncclSymLLEpochSize(nRanks); + ans += block * /*epochs=*/2 * epochSize; + ans += (epoch & 1)*epochSize; + return (uint4*)ans; +} + +struct ncclSymDevComm { + ncclSymDevBase* base; + ncclSymDevBase* baseMc; + uint32_t stride4G; + int nRanks, rank; + uint32_t nRanks_rcp32; // idivRcp32(nRanks) +}; + +struct alignas(16) ncclSymDevArgs { + struct ncclSymDevComm comm; + int rootRank; + uint64_t redOpArg; // must be collectively uniform + size_t nElts; + char* input; + char* output; +}; + +enum ncclSymKernelId { + ncclSymKernelId_AllReduce_AGxLL_R, + ncclSymKernelId_AllReduce_AGxLLMC_R, + ncclSymKernelId_AllReduce_RSxLD_AGxST, + ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC, + + ncclSymKernelId_AllGather_LL, + ncclSymKernelId_AllGather_LLMC, + ncclSymKernelId_AllGather_ST, + ncclSymKernelId_AllGather_STMC, + + ncclSymKernelId_ReduceScatter_LL, + ncclSymKernelId_ReduceScatter_LD, + ncclSymKernelId_ReduceScatter_LDMC, + + ncclSymKernelId_Count +}; + +bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); + +ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps); + +// Generated by src/device/symmetric/generate.py +extern int const ncclSymKernelCount; +extern void* const ncclSymKernelList[]; +void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); +const char* ncclSymKernelIdToString(int kernelId); + +#endif diff --git a/src/include/transport.h b/src/include/transport.h index c563fbbd6..a9971a74f 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -22,6 +22,7 @@ #include "proxy.h" #include "comm.h" +#include "bootstrap.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -46,6 +47,7 @@ struct ncclPeerInfo { int64_t busId; struct ncclComm* comm; int cudaCompCap; + size_t totalGlobalMem; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; @@ -53,6 +55,8 @@ struct ncclPeerInfo { }; #define CONNECT_SIZE 256 +#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L) +#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L) struct ncclConnect { char data[CONNECT_SIZE]; }; @@ -80,6 +84,7 @@ struct ncclNvlsSharedRes { char* ucBuff; // Unicast NVLS buffer address char* ucCredit; // Unicast NVLS credit address int nChannels; + int nHeads; struct ncclShmemCollBuff nvlsShmem; void *nvlsShmemHandle; }; @@ -119,7 +124,8 @@ struct ncclTransport { ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); -ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode); +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode); +ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); @@ -154,5 +160,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, 
size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue); ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels); + +ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm); +ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr); +ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr); +ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm); +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm); +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr); +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr); +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm); #endif diff --git a/src/include/utils.h b/src/include/utils.h index 383f678c8..bfed2722c 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -43,6 +43,12 @@ static long log2i(long n) { return log2Down(n); } +// Comparator function for qsort/bsearch to compare integers +static int compareInts(const void *a, const void *b) { + int ia = *(const int*)a, ib = *(const int*)b; + return (ia > ib) - (ia < ib); +} + inline uint64_t clockNano() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); diff --git a/src/init.cc b/src/init.cc index 47d7fa3c6..83764a883 100644 --- a/src/init.cc +++ b/src/init.cc @@ -18,6 +18,7 @@ #include "argcheck.h" #include "tuner.h" #include "ras.h" +#include "profiler.h" #include "mnnvl.h" #include #include @@ -29,6 +30,7 @@ #include #include "param.h" #include "nvtx_payload_schemas.h" +#include "utils.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -48,6 +50,10 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); +NCCL_PARAM(WinEnable, "WIN_ENABLE", 1); +NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); static ncclResult_t commReclaim(ncclComm_t comm); @@ -174,6 +180,10 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + if (comm->symmetricSupport && comm->symDevComm.base) { + NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride)); + } + NCCLCHECK(ncclRasCommFini(comm)); /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will @@ -253,15 +263,16 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); + if (comm->symmetricSupport) { + NCCLCHECK(ncclNvlsSymmetricFinalize(comm)); + NCCLCHECK(ncclIpcSymmetricFinalize(comm)); + } INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, 
comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); - NCCLCHECK(ncclNetPluginUnload(comm)); - ncclCudaContextDrop(comm->context); - free(comm); return ncclSuccess; @@ -271,7 +282,7 @@ NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) -NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", -1); +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; @@ -331,12 +342,10 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->rank = rank; comm->nRanks = ndev; - NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); - NCCLCHECK(ncclProfilerPluginInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); - if (parent && parent->config.splitShare) { + if (parent && parent->shareResources) { if (parent->ncclNet != comm->ncclNet) { WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); return ncclInvalidUsage; @@ -361,13 +370,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; - comm->collNetSupport = 0; memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); - comm->groupNext = reinterpret_cast(0x1); + for (int i = 0; i < ncclGroupTaskTypeNum; i++) { + comm->groupNext[i] = reinterpret_cast(0x1); + } comm->preconnectNext = reinterpret_cast(0x1); static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); @@ -378,7 +388,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // Mark channels as non initialized. for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; - if (parent == NULL || !parent->config.splitShare) { + if (parent == NULL || !parent->shareResources) { struct ncclSharedResources* sharedRes = NULL; NCCLCHECK(ncclCalloc(&sharedRes, 1)); /* most of attributes are assigned later in initTransportsRank(). 
*/ @@ -432,6 +442,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { bool ccEnable; cudaStream_t deviceStream; + memset(&tmpCommAndChans, '\0', sizeof(tmpCommAndChans)); NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); @@ -458,22 +469,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ccEnable) { comm->workFifoBytes = 0; } else { - int64_t workFifoBytesParam = ncclParamWorkFifoBytes(); - if (workFifoBytesParam == -1) { - if (comm->MNNVL && (comm->compCap >= 100)) { - // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL - INFO(NCCL_INIT, "Disabling work fifo"); - comm->workFifoBytes = 0; - } else { - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; - } - } else { - if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) { - WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam); - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; - } - comm->workFifoBytes = std::min(workFifoBytesParam, 1ul<<30); + comm->workFifoBytes = ncclParamWorkFifoBytes(); + if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; } + comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) { @@ -492,11 +493,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoBufDev = comm->workFifoBuf; } - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoConsumed); comm->workFifoProduced = 0; - comm->workFifoConsumedLeast = 0; - tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + comm->workFifoProducedLastRecorded = 0; + comm->workFifoConsumed = 0; // Alloc profiler counters for the kernel NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); @@ -549,6 +548,7 @@ NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { + cudaDeviceProp prop; info->rank = comm->rank; info->cudaDev = comm->cudaDev; info->nvmlDev = comm->nvmlDev; @@ -556,6 +556,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; info->cuMemSupport = ncclCuMemEnable(); + CUDACHECK(cudaGetDeviceProperties(&prop, comm->cudaDev)); + info->totalGlobalMem = ROUNDUP(prop.totalGlobalMem, (1L << 32)); // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process @@ -700,6 +702,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclTopoRanks topoRanks; int cpuArch; int cpuVendor; + int localRanks; }; int nChannelsOrig; @@ -711,12 +714,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; int *topParentLocalRanks = NULL; + int p2pLevel = -1; timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root 
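/* Editor's note: a small stand-alone sketch of the NCCL_WORK_FIFO_BYTES handling that
 * devCommSetup() now performs above: take the parameter verbatim, fall back to the
 * 1 MiB NCCL_WORK_FIFO_BYTES_DEFAULT when the value is not a power of two, and clamp
 * to 1 GiB. validateWorkFifoBytes() is a hypothetical helper for illustration only. */
#include <algorithm>
#include <cstdint>

static uint32_t validateWorkFifoBytes(uint64_t requested) {
  uint64_t bytes = requested;
  if ((bytes & (bytes - 1)) != 0) bytes = 1 << 20;         // not a power of two -> use the default
  return (uint32_t)std::min<uint64_t>(bytes, 1u << 30);    // never exceed 1 GiB
}
// validateWorkFifoBytes(4 << 20)    == 4 MiB (power of two, passes through)
// validateWorkFifoBytes(3 << 20)    == 1 MiB (rejected, default applied)
// validateWorkFifoBytes(1ull << 31) == 1 GiB (clamped)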
NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); + __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE); comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { @@ -738,7 +743,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; // Check for MNNVL support - if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail); + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) { NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } @@ -829,14 +835,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } // Determine local CollNet support - if (collNetSupport(comm)) { - const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE"); - if (collNetEnable != NULL) { - INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); - if (strcmp(collNetEnable, "1") == 0) { - comm->collNetSupport = 1; - } - } + if (!collNetSupport(comm)) { + comm->config.collnetEnable = 0; } // Determine local Nvls support @@ -873,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p collNetDirectGraph->collNet = 1; collNetDirectGraph->minChannels = 1; collNetDirectGraph->maxChannels = MAXCHANNELS; - if (comm->collNetSupport) { + if (comm->config.collnetEnable) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail); @@ -1014,7 +1014,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } - if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; + if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); @@ -1025,11 +1025,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } // Determine CollNet support after all-gather now that we know nNodes and each node localRanks - if (comm->collNetSupport == 1) { + if (comm->config.collnetEnable == 1) { int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); - comm->collNetSupport = 0; + comm->config.collnetEnable = 0; } } NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); @@ -1075,9 +1075,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->topParentLocalRanks = topParentLocalRanks; - NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail); + // Profiler plugin context has to be initialized before proxy thread + NCCLCHECK(ncclProfilerPluginInit(comm)); + + NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, 
&comm->directMode), ret, fail); // Launch proxy service thread, after this, the proxy calls can be used. - if (parent && parent->config.splitShare) { + if (parent && parent->shareResources) { comm->proxyState = parent->sharedRes->proxyState; ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount); } else { @@ -1147,10 +1150,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } - // Setup NVLS + // Attempt to setup NVLS, may silently fail and disable NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); // Check if we can setup CollNet - if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs); + if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs); } else { for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); @@ -1163,7 +1166,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Connect PAT only for communicators with 1 GPU per node if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); - // Setup NVLS + // Attempt to setup NVLS, may silently fail and disable NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); @@ -1171,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); // Check if we can setup CollNet - if (comm->collNetSupport > 0) { + if (comm->config.collnetEnable) { ncclCollNetSetup(comm, parent, graphs); NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) { @@ -1244,9 +1247,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } } + comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable(); + comm->baseStride = 0; + // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); + timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1260,7 +1267,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. 
*/ - if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); + if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); @@ -1293,6 +1300,9 @@ struct ncclCommInitRankAsyncJob { struct ncclComm* parent; int color, key; int splitCount; + // For Shrink + int* excludeRanksList; + int excludeRanksCount; // name of the function calling char funcName[NCCL_COMMINIT_FUNCNAME_LEN]; }; @@ -1303,6 +1313,7 @@ struct ncclCommFinalizeAsyncJob { }; NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(CommShrinkShareResources, "COMM_SHRINK_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); typedef struct{ int key; @@ -1350,6 +1361,21 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par goto exit; } +static ncclResult_t getParentRanks(int parentRanks, int parentRank, int* excludeRanksList, int excludeRanksCount, int* nRanksRet, int* myRankRet, int* parentRanksRet) { + int count = 0, j = 0; + for (int i = 0; i < parentRanks; i++) { + // we assume excludeRanksList is sorted + if (j < excludeRanksCount && excludeRanksList[j] == i) { + j++; + continue; + } + if (i == parentRank) *myRankRet = count; + parentRanksRet[count++] = i; + } + *nRanksRet = parentRanks - excludeRanksCount; + return ncclSuccess; +} + static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t comm = job->comm; @@ -1383,9 +1409,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); - NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); - // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. - if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; + if (job->excludeRanksCount) { + NCCLCHECKGOTO(getParentRanks(job->parent->nRanks, job->parent->rank, job->excludeRanksList, job->excludeRanksCount, &job->nranks, &job->myrank, parentRanks), res, fail); + } else { + NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); + // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. + if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; + } timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; @@ -1477,6 +1507,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; + int collnetEnableEnv; + int ctaPolicyEnv; + int shrinkShareEnv; + int nvlsCTAsEnv; /* override configuration from env variable. 
*/ blockingEnv = ncclParamCommBlocking(); @@ -1522,6 +1556,25 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.splitShare = splitShareEnv; } + shrinkShareEnv = ncclParamCommShrinkShareResources(); + if (shrinkShareEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.shrinkShare = shrinkShareEnv; + } + + collnetEnableEnv = ncclParamCollnetEnable(); + if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.collnetEnable = collnetEnableEnv; + } + + ctaPolicyEnv = ncclParamCtaPolicy(); + if (ctaPolicyEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.CTAPolicy = ctaPolicyEnv; + } + + nvlsCTAsEnv = ncclParamNvlsChannels(); + if (nvlsCTAsEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.nvlsCTAs = nvlsCTAsEnv; + } /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { @@ -1544,6 +1597,20 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.splitShare = 0; } + if (comm->config.collnetEnable != 1 && comm->config.collnetEnable != 0) { + INFO(NCCL_ENV, "collnetEnable %d is not a valid value 0/1, set it to 0", comm->config.collnetEnable); + comm->config.collnetEnable = 0; + } + + if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) { + INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT); + comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT; + } + + if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT && comm->config.nvlsCTAs <= 0) { + INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs); + comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT; + } return ret; } @@ -1584,6 +1651,17 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { internalConfigPtr->maxCTAs = defaultConfig.maxCTAs; internalConfigPtr->netName = defaultConfig.netName; } + + if (internalConfigPtr->version < NCCL_VERSION(2, 25, 0)) { + internalConfigPtr->trafficClass = defaultConfig.trafficClass; + } + + if (internalConfigPtr->version < NCCL_VERSION(2, 27, 0)) { + internalConfigPtr->collnetEnable = defaultConfig.collnetEnable; + internalConfigPtr->CTAPolicy = defaultConfig.CTAPolicy; + internalConfigPtr->shrinkShare = defaultConfig.shrinkShare; + internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs; + } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. 
*/ @@ -1615,6 +1693,31 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { goto fail; } + if (internalConfigPtr->collnetEnable != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->collnetEnable < 0 || internalConfigPtr->collnetEnable > 1)) { + WARN("Invalid config collnetEnable attribute value %d", internalConfigPtr->collnetEnable); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT || + internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) { + WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->shrinkShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->shrinkShare != 0 && internalConfigPtr->shrinkShare != 1) { + WARN("Invalid config shrinkShare attribute value %d", internalConfigPtr->shrinkShare); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->nvlsCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlsCTAs <= 0) { + WARN("Invalid config nvlsCTAs attribute value %d", internalConfigPtr->nvlsCTAs); + ret = ncclInvalidArgument; + goto fail; + } + /* default config value can be tuned on different platform. */ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); @@ -1623,6 +1726,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, commName, NCCL_CONFIG_UNDEF_PTR, NULL, "Comm name", "%s"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, collnetEnable, NCCL_CONFIG_UNDEF_INT, 0, "Collnet enable", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1632,7 +1740,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; comm->config.trafficClass = internalConfigPtr->trafficClass; - + comm->config.commName = internalConfigPtr->commName; + comm->config.collnetEnable = internalConfigPtr->collnetEnable; + comm->config.CTAPolicy = internalConfigPtr->CTAPolicy; + comm->config.shrinkShare = internalConfigPtr->shrinkShare; + comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: @@ -1909,7 +2021,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret); } - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, true), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // 
And keep polling until all graphs referencing us die. while (comm->localPersistentRefs != 0) { @@ -2052,7 +2164,6 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); - NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2067,13 +2178,22 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; } +static ncclResult_t setCommAbortFlags(ncclComm_t comm, int value) { + // Set abort flags + if (comm->childAbortFlag != nullptr) { + __atomic_store_n(comm->childAbortFlag, value, __ATOMIC_RELEASE); + __atomic_store_n(comm->childAbortFlagDev, value, __ATOMIC_RELEASE); + } + __atomic_store_n(comm->abortFlag, value, __ATOMIC_RELEASE); + __atomic_store_n(comm->abortFlagDev, value, __ATOMIC_RELEASE); + return ncclSuccess; +} + NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { NVTX3_RANGE(NcclNvtxParamsCommAbort); @@ -2081,14 +2201,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } - NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit - if (comm->childAbortFlag != nullptr) { - __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); - __atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE); - } - __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE); - __atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE); + NCCLCHECK(setCommAbortFlags(comm,1)); comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. 
*/ @@ -2109,38 +2223,51 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); return ncclSuccess; fail: goto exit; } -NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); -ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { +static void childCommCleanupJob(void* job) { + struct ncclCommInitRankAsyncJob* initJob = (struct ncclCommInitRankAsyncJob*)job; + if (initJob->excludeRanksList) free(initJob->excludeRanksList); + free(job); +} + +// initializing a child communicator (for both split and shrink) +static ncclResult_t ncclCommInitChildComm(ncclComm_t comm, ncclComm_t* newcomm, bool isShrink, int flags, int color, int key, int* excludeRanksList, int excludeRanksCount, + ncclConfig_t* config, const char* caller) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NVTX3_RANGE(NcclNvtxParamsCommSplit) - int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); + NCCLCHECKGOTO(CommCheck(comm, caller, "comm"), res, exit); + NCCLCHECKGOTO(PtrCheck(newcomm, caller, "newcomm"), res, exit); + if (isShrink) { + NCCLCHECKGOTO(PtrCheck(excludeRanksList, caller, "excludeRanksList"), res, exit); + NCCLCHECKGOTO(excludeRanksCount > 0 ? ncclSuccess : ncclInvalidArgument, res, exit); + // excludeRanksList may not be sorted, need to sort it + qsort(excludeRanksList, excludeRanksCount, sizeof(int), compareInts); + // ranks in excludeRanksList should not call into this function + NCCLCHECKGOTO(bsearch(&comm->rank, excludeRanksList, excludeRanksCount, sizeof(int), compareInts) ? ncclInvalidArgument : ncclSuccess, res, exit); + } + NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, exit); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, exit); - NCCLCHECK(ncclGroupStartInternal()); - NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail); - NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); - NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */ *newcomm = NCCL_COMM_NULL; - if (color == NCCL_SPLIT_NOCOLOR) { + if (!isShrink && color == NCCL_SPLIT_NOCOLOR) { INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank); } else { NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail); childComm->startMagic = childComm->endMagic = NCCL_MAGIC; - if (comm->config.splitShare) { + + // Set the shareResource field, this is used throughout the init and must be reset every time. + // If we shrink, we only reuse resources if we shrink in the default mode + comm->shareResources = isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare; + if (comm->shareResources) { childComm->abortFlag = comm->abortFlag; childComm->abortFlagDev = comm->abortFlagDev; childComm->abortFlagRefCount = comm->abortFlagRefCount; @@ -2161,38 +2288,39 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } - /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. 
*/ - childComm->initState = ncclInProgress; + /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ + childComm->initState = ncclInternalError; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; - job->splitCount = ++comm->splitCount; job->color = color; job->key = key; + if (excludeRanksList) { + // need to copy the list of ranks to exclude because the job is async + job->excludeRanksCount = excludeRanksCount; + NCCLCHECKGOTO(ncclCalloc(&job->excludeRanksList, excludeRanksCount), res, fail); + memcpy(job->excludeRanksList, excludeRanksList, excludeRanksCount * sizeof(int)); + } else { + // each split has to lead to a unique comm, so increment the splitCount + job->splitCount = ++comm->splitCount; + job->excludeRanksList = NULL; + } job->cudaDev = comm->cudaDev; - snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__); - NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail); + snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", caller); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, /*undo=*/NULL, /*destructor=*/childCommCleanupJob, comm), res, fail); exit: (void)cudaSetDevice(oldDev); - (void)ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); - - if (res == ncclSuccess && *newcomm) { - NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, - NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); - } - return res; fail: if (childComm) { - if (!comm->config.splitShare) { - free(childComm->abortFlag); + if (!comm->shareResources) { + if (childComm->abortFlag) free(childComm->abortFlag); if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); - free(childComm->abortFlagRefCount); + if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); } free(childComm); } @@ -2200,6 +2328,44 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc goto exit; } +NCCL_API(ncclResult_t, ncclCommShrink, ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); +ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t *newcomm, ncclConfig_t* config, int shrinkFlags) { + NVTX3_RANGE(NcclNvtxParamsCommShrink) + ncclResult_t res = ncclSuccess; + NCCLCHECK(ncclGroupStartInternal()); + // Handle error mode by setting abort flags and waiting for kernels to complete and unset the flags to avoid bootstrap issues + if (shrinkFlags & NCCL_SHRINK_ABORT) { + NCCLCHECKGOTO(setCommAbortFlags(comm, 1), res, exit); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), res, exit); + NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit); + } + NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), res, exit); + + if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount)); + +exit: + (void)ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; +} + +NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, 
int key, ncclComm_t *newcomm, ncclConfig_t *config) { + NVTX3_RANGE(NcclNvtxParamsCommSplit) + + ncclResult_t res = ncclSuccess; + NCCLCHECK(ncclGroupStartInternal()); + NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit); + + if (*newcomm) + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + +exit: + (void)ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; +} + NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { @@ -2277,119 +2443,3 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { *rank = comm->rank; return ncclSuccess; } - -NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); -ncclResult_t ncclMemAlloc(void **ptr, size_t size) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret = ncclSuccess; - -#if CUDART_VERSION >= 12010 - size_t memGran = 0; - CUdevice currentDev; - CUmemAllocationProp memprop = {}; - CUmemAccessDesc accessDesc = {}; - CUmemGenericAllocationHandle handle; - int cudaDev; - int flag; - int dcnt; - - if (ptr == NULL || size == 0) goto fallback; - - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; - - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - - if (ncclCuMemEnable()) { - size_t handleSize = size; - int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - // Query device to see if FABRIC handle support is available - flag = 0; - (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); - if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; - memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; - memprop.location.id = currentDev; - // Query device to see if RDMA support is available - flag = 0; - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); - if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; - CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - CUDACHECK(cudaGetDeviceCount(&dcnt)); - ALIGN_SIZE(handleSize, memGran); - - if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { - /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); - if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { - requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; - memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); - } - } else { - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); - } - /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); - /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); - /* Now allow RW access to the newly mapped memory */ - for (int i = 0; i < dcnt; ++i) { - int p2p = 0; - if (i == cudaDev || 
((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) { - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = i; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); - } - if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); - } - goto exit; - } - -fallback: -#endif - // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though: - // we want CUDA to return an error to the caller. - // coverity[var_deref_model] - CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); - -exit: - return ret; -fail: - goto exit; -} - -NCCL_API(ncclResult_t, ncclMemFree, void *ptr); -ncclResult_t ncclMemFree(void *ptr) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret = ncclSuccess; - int saveDevice; - - CUDACHECK(cudaGetDevice(&saveDevice)); -#if CUDART_VERSION >= 12010 - CUdevice ptrDev = 0; - - if (ptr == NULL) goto fallback; - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; - - CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); - CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); - goto exit; - } - -fallback: -#endif - CUDACHECKGOTO(cudaFree(ptr), ret, fail); - -exit: - CUDACHECK(cudaSetDevice(saveDevice)); - return ret; -fail: - goto exit; -} diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 64a84f556..5b66fea92 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -105,53 +105,53 @@ int ncclCuMemHostEnable() { #endif } -#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr +#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN(cuDeviceGet); -DECLARE_CUDA_PFN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN(cuGetErrorString); -DECLARE_CUDA_PFN(cuGetErrorName); +DECLARE_CUDA_PFN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN(cuGetErrorName, 6000); /* enqueue.cc */ -DECLARE_CUDA_PFN(cuMemGetAddressRange); -DECLARE_CUDA_PFN(cuLaunchKernel); +DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN(cuLaunchKernel, 4000); #if CUDA_VERSION >= 11080 -DECLARE_CUDA_PFN(cuLaunchKernelEx); +DECLARE_CUDA_PFN(cuLaunchKernelEx, 11060); #endif /* proxy.cc */ -DECLARE_CUDA_PFN(cuCtxCreate); -DECLARE_CUDA_PFN(cuCtxDestroy); -DECLARE_CUDA_PFN(cuCtxGetCurrent); -DECLARE_CUDA_PFN(cuCtxSetCurrent); -DECLARE_CUDA_PFN(cuCtxGetDevice); +DECLARE_CUDA_PFN(cuCtxCreate, 11040); +DECLARE_CUDA_PFN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN(cuCtxGetDevice, 2000); /* cuMem API support */ -DECLARE_CUDA_PFN(cuMemAddressReserve); -DECLARE_CUDA_PFN(cuMemAddressFree); -DECLARE_CUDA_PFN(cuMemCreate); -DECLARE_CUDA_PFN(cuMemGetAllocationGranularity); -DECLARE_CUDA_PFN(cuMemExportToShareableHandle); -DECLARE_CUDA_PFN(cuMemImportFromShareableHandle); -DECLARE_CUDA_PFN(cuMemMap); -DECLARE_CUDA_PFN(cuMemRelease); -DECLARE_CUDA_PFN(cuMemRetainAllocationHandle); -DECLARE_CUDA_PFN(cuMemSetAccess); -DECLARE_CUDA_PFN(cuMemUnmap); -DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle); +DECLARE_CUDA_PFN(cuMemAddressReserve, 10020); 
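/* Side note (illustration, not patch content): with the added version argument, a declaration such as
 *   DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
 * expands to the versioned function-pointer declaration
 *   PFN_cuMemAddressReserve_v10020 pfn_cuMemAddressReserve = nullptr;
 * and the same number is what the CUDA 13 LOAD_SYM variant further below passes to
 * cudaGetDriverEntryPointByVersion(), so the entry point matching that driver API version is retrieved. */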
+DECLARE_CUDA_PFN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN(cuMemCreate, 10020); +DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN(cuMemMap, 10020); +DECLARE_CUDA_PFN(cuMemRelease, 10020); +DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN(cuMemUnmap, 10020); +DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle, 10020); /* ncclMemAlloc/Free */ -DECLARE_CUDA_PFN(cuPointerGetAttribute); +DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ -DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ -DECLARE_CUDA_PFN(cuMulticastAddDevice); -DECLARE_CUDA_PFN(cuMulticastBindMem); -DECLARE_CUDA_PFN(cuMulticastBindAddr); -DECLARE_CUDA_PFN(cuMulticastCreate); -DECLARE_CUDA_PFN(cuMulticastGetGranularity); -DECLARE_CUDA_PFN(cuMulticastUnbind); +DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010); +DECLARE_CUDA_PFN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN(cuMulticastUnbind, 12010); #endif #endif @@ -162,8 +162,17 @@ bool ncclCudaLaunchBlocking = false; #if CUDART_VERSION >= 11030 -#if CUDART_VERSION >= 12000 -#define LOAD_SYM(symbol, ignore) do { \ +#if CUDART_VERSION >= 13000 +#define LOAD_SYM(symbol, version, ignore) do { \ + cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \ + res = cudaGetDriverEntryPointByVersion(#symbol, (void **) (&pfn_##symbol), version, cudaEnableDefault, &driverStatus); \ + if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ + if (!ignore) { \ + WARN("Retrieve %s version %d failed with %d status %d", #symbol, version, res, driverStatus); \ + return ncclSystemError; } \ + } } while(0) +#elif CUDART_VERSION >= 12000 +#define LOAD_SYM(symbol, version, ignore) do { \ cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \ if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ @@ -172,7 +181,7 @@ bool ncclCudaLaunchBlocking = false; return ncclSystemError; } \ } } while(0) #else -#define LOAD_SYM(symbol, ignore) do { \ +#define LOAD_SYM(symbol, version, ignore) do { \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \ if (res != cudaSuccess) { \ if (!ignore) { \ @@ -188,46 +197,46 @@ static ncclResult_t cudaPfnFuncLoader(void) { cudaError_t res; - LOAD_SYM(cuGetErrorString, 0); - LOAD_SYM(cuGetErrorName, 0); - LOAD_SYM(cuDeviceGet, 0); - LOAD_SYM(cuDeviceGetAttribute, 0); - LOAD_SYM(cuMemGetAddressRange, 1); - LOAD_SYM(cuCtxCreate, 1); - LOAD_SYM(cuCtxDestroy, 1); - LOAD_SYM(cuCtxGetCurrent, 1); - LOAD_SYM(cuCtxSetCurrent, 1); - LOAD_SYM(cuCtxGetDevice, 1); - LOAD_SYM(cuLaunchKernel, 1); + LOAD_SYM(cuGetErrorString, 6000, 0); + LOAD_SYM(cuGetErrorName, 6000, 0); + LOAD_SYM(cuDeviceGet, 2000, 0); + LOAD_SYM(cuDeviceGetAttribute, 2000, 0); + LOAD_SYM(cuMemGetAddressRange, 3020, 1); + LOAD_SYM(cuCtxCreate, 11040, 1); + LOAD_SYM(cuCtxDestroy, 4000, 1); + LOAD_SYM(cuCtxGetCurrent, 4000, 1); + 
LOAD_SYM(cuCtxSetCurrent, 4000, 1); + LOAD_SYM(cuCtxGetDevice, 2000, 1); + LOAD_SYM(cuLaunchKernel, 4000, 1); #if CUDA_VERSION >= 11080 - LOAD_SYM(cuLaunchKernelEx, 1); + LOAD_SYM(cuLaunchKernelEx, 11060, 1); #endif /* cuMem API support */ - LOAD_SYM(cuMemAddressReserve, 1); - LOAD_SYM(cuMemAddressFree, 1); - LOAD_SYM(cuMemCreate, 1); - LOAD_SYM(cuMemGetAllocationGranularity, 1); - LOAD_SYM(cuMemExportToShareableHandle, 1); - LOAD_SYM(cuMemImportFromShareableHandle, 1); - LOAD_SYM(cuMemMap, 1); - LOAD_SYM(cuMemRelease, 1); - LOAD_SYM(cuMemRetainAllocationHandle, 1); - LOAD_SYM(cuMemSetAccess, 1); - LOAD_SYM(cuMemUnmap, 1); - LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1); + LOAD_SYM(cuMemAddressReserve, 10020, 1); + LOAD_SYM(cuMemAddressFree, 10020, 1); + LOAD_SYM(cuMemCreate, 10020, 1); + LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1); + LOAD_SYM(cuMemExportToShareableHandle, 10020, 1); + LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1); + LOAD_SYM(cuMemMap, 10020, 1); + LOAD_SYM(cuMemRelease, 10020, 1); + LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1); + LOAD_SYM(cuMemSetAccess, 10020, 1); + LOAD_SYM(cuMemUnmap, 10020, 1); + LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 10020, 1); /* ncclMemAlloc/Free */ - LOAD_SYM(cuPointerGetAttribute, 1); + LOAD_SYM(cuPointerGetAttribute, 4000, 1); #if CUDA_VERSION >= 11070 - LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support + LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ - LOAD_SYM(cuMulticastAddDevice, 1); - LOAD_SYM(cuMulticastBindMem, 1); - LOAD_SYM(cuMulticastBindAddr, 1); - LOAD_SYM(cuMulticastCreate, 1); - LOAD_SYM(cuMulticastGetGranularity, 1); - LOAD_SYM(cuMulticastUnbind, 1); + LOAD_SYM(cuMulticastAddDevice, 12010, 1); + LOAD_SYM(cuMulticastBindMem, 12010, 1); + LOAD_SYM(cuMulticastBindAddr, 12010, 1); + LOAD_SYM(cuMulticastCreate, 12010, 1); + LOAD_SYM(cuMulticastGetGranularity, 12010, 1); + LOAD_SYM(cuMulticastUnbind, 12010, 1); #endif return ncclSuccess; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 698465ca4..23bf5e125 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -8,7 +8,11 @@ #include #include +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else #include "ibvcore.h" +#endif #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc new file mode 100644 index 000000000..5bb4109f3 --- /dev/null +++ b/src/misc/mlx5dvsymbols.cc @@ -0,0 +1,74 @@ +#include +#include + +#include "mlx5/mlx5dvsymbols.h" + +#ifdef NCCL_BUILD_MLX5DV +/* Mlx5dv linking mode. Symbols are pointers to linked MLX5 Direct Verbs */ + +#define ASSIGN_SYM(container, symbol, name) container->name= &symbol; + +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_is_supported, mlx5dv_internal_is_supported); + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path); + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr); + return ncclSuccess; +} + +#else +/* Mlx5dv dynamic loading mode. Symbols are loaded from shared objects. 
*/ + +#include +#include "core.h" + +// MLX5DV Library versioning +#define MLX5DV_VERSION "MLX5_1.8" + +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { + static void* mlx5dvhandle = NULL; + void* tmp; + void** cast; + + mlx5dvhandle=dlopen("libmlx5.so", RTLD_NOW); + if (!mlx5dvhandle) { + mlx5dvhandle=dlopen("libmlx5.so.1", RTLD_NOW); + if (!mlx5dvhandle) { + INFO(NCCL_INIT, "Failed to open libmlx5.so[.1]"); + goto teardown; + } + } + +#define LOAD_SYM(handle, symbol, funcptr) do { \ + cast = (void**)&funcptr; \ + tmp = dlvsym(handle, symbol, MLX5DV_VERSION); \ + if (tmp == NULL) { \ + WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), MLX5DV_VERSION); \ + goto teardown; \ + } \ + *cast = tmp; \ + } while (0) + +// Attempt to load a specific symbol version - fail silently +#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ + cast = (void**)&funcptr; \ + *cast = dlvsym(handle, symbol, version); \ + } while (0) + + LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported); + // Cherry-pick the mlx5dv_get_data_direct_sysfs_path API from MLX5 1.25 + LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_get_data_direct_sysfs_path", mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path, "MLX5_1.25"); + // Cherry-pick the ibv_reg_dmabuf_mr API from MLX5 1.25 + LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_reg_dmabuf_mr", mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr, "MLX5_1.25"); + + return ncclSuccess; + +teardown: + mlx5dvSymbols->mlx5dv_internal_is_supported = NULL; + mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = NULL; + mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = NULL; + + if (mlx5dvhandle != NULL) dlclose(mlx5dvhandle); + return ncclSystemError; +} + +#endif diff --git a/src/misc/mlx5dvwrap.cc b/src/misc/mlx5dvwrap.cc new file mode 100644 index 000000000..930ed5d2e --- /dev/null +++ b/src/misc/mlx5dvwrap.cc @@ -0,0 +1,75 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mlx5/mlx5dvwrap.h" +#include +#include + +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif +#include "mlx5/mlx5dvsymbols.h" + +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +struct ncclMlx5dvSymbols mlx5dvSymbols; + +ncclResult_t wrap_mlx5dv_symbols(void) { + pthread_once(&initOnceControl, + [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); }); + return initResult; +} + +/* CHECK_NOT_NULL: helper macro to check for NULL symbol */ +#define CHECK_NOT_NULL(container, internal_name) \ + if (container.internal_name == NULL) { \ + WARN("lib wrapper not initialized."); \ + return ncclInternalError; \ + } + +#define MLX5DV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + retval = container.call; \ + if (retval == error_retval) { \ + WARN("Call to " name " failed with error %s", strerror(errno)); \ + return ncclSystemError; \ + } \ + return ncclSuccess; + +#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + int ret = container.call; \ + if (ret != success_retval) { \ + INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + return ncclSystemError; \ + } \ + return ncclSuccess; + +bool wrap_mlx5dv_is_supported(struct ibv_device *device) { + if (mlx5dvSymbols.mlx5dv_internal_is_supported == NULL) { + return 0; + } + return mlx5dvSymbols.mlx5dv_internal_is_supported(device); +} + +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) { + MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path"); +} + +/* DMA-BUF support */ +ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { + MLX5DV_PTR_CHECK_ERRNO(mlx5dvSymbols, mlx5dv_internal_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access), *ret, NULL, "mlx5dv_reg_dmabuf_mr"); +} + +struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { + if (mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr == NULL) { + errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set + return NULL; + } + return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access); +} \ No newline at end of file diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 731dbcee1..278fb5c51 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -68,7 +68,8 @@ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, i return ncclSuccess; } else { char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); + WARN("socketProgress: Connection closed by remote peer %s", + ncclSocketToString(&sock->addr, line, /*numericHostForm*/0)); return ncclRemoteError; } } @@ -86,17 +87,22 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s * Output: "IPv4/IPv6 address" */ const char 
*ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { - if (buf == NULL || addr == NULL) return NULL; - const struct sockaddr *saddr = &addr->sa; - if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } + const struct sockaddr *saddr; char host[NI_MAXHOST], service[NI_MAXSERV]; + int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); + if (buf == NULL || addr == NULL) goto fail; + saddr = &addr->sa; + if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. * (When not set, this will still happen in case the node's name cannot be determined.) */ - int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); - (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); + if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail; sprintf(buf, "%s<%s>", host, service); return buf; +fail: + if (buf) + buf[0] = '\0'; + return buf; } static uint16_t socketToPort(union ncclSocketAddress *addr) { @@ -120,7 +126,8 @@ static int envSocketFamily(void) { return family; } -static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, + int maxIfNameSize, int maxIfs, int* found) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif @@ -131,10 +138,10 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); - int found = 0; + *found = 0; struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { + SYSCHECK(getifaddrs(&interfaces), "getifaddrs"); + for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ @@ -162,23 +169,23 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA // Check that this interface has not already been saved // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link bool duplicate = false; - for (int i = 0; i < found; i++) { + for (int i = 0; i < *found; i++) { if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } } if (!duplicate) { // Store the interface name - strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); + strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - memset(addrs+found, '\0', sizeof(*addrs)); - memcpy(addrs+found, interface->ifa_addr, salen); - found++; + memset(addrs + *found, '\0', sizeof(*addrs)); + memcpy(addrs + *found, interface->ifa_addr, salen); + (*found)++; } } freeifaddrs(interfaces); - return found; + return ncclSuccess; } static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { @@ -219,20 +226,21 @@ static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); return same; } else { - WARN("Net : Unsupported address family type"); + INFO(NCCL_NET, "Net : Unsupported address family type"); return false; } } -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { +ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr, + union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; -#endif char line_a[SOCKET_NAME_MAXLEN+1]; - int found = 0; +#endif + *found = 0; struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && !found; interface = interface->ifa_next) { + SYSCHECK(getifaddrs(&interfaces), "getifaddrs"); + for (interface = interfaces; interface && !*found; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ @@ -247,21 +255,18 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd // Store the local IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - memcpy(localAddrs+found, interface->ifa_addr, salen); + memcpy(localAddr, interface->ifa_addr, salen); // Store the interface name - strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); + strncpy(ifName, interface->ifa_name, ifNameMaxSize); - TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); - found++; - if (found == maxIfs) break; + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", + interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a)); + *found = 1; } - if (found == 0) { - WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); - } freeifaddrs(interfaces); - return found; + return ncclSuccess; } ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { @@ -344,40 +349,41 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char return ncclSuccess; } -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { +ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs, + int* nIfs) { static int shownIfName = 0; - int nIfs = 0; // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); // User specified interface const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME"); + *nIfs = 0; if (env && strlen(env) > 1) { INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail if 
(shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); - nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + NCCLCHECK(findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } else { // Try to automatically pick the right one // Start with IB - nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // else see if we can get some hint from COMM ID - if (nIfs == 0) { + if (*nIfs == 0) { const char* commId = ncclGetEnv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); // Try to find interface that is in the same subnet as the IP in comm id union ncclSocketAddress idAddr; - ncclSocketGetAddrFromString(&idAddr, commId); - nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); + NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId)); + NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs)); } } // Then look for anything else (but not docker or lo) - if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // Finally look for docker, then lo. - if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } - return nIfs; + return ncclSuccess; } ncclResult_t ncclSocketListen(struct ncclSocket* sock) { @@ -439,17 +445,20 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { /* per accept's man page, for linux sockets, the following errors might be already pending errors * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ if (++sock->errorRetries == ncclParamRetryCnt()) { - WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno)); + WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(errno)); return ncclSystemError; } - INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno)); - } else if (errno != EAGAIN && errno != EWOULDBLOCK) { + INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(errno)); + } else if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; } return ncclSuccess; } +NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1); +NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1); + static ncclResult_t socketSetFlags(struct ncclSocket* sock) { const int one = 1; /* Set socket as non-blocking if async or if we need to be able to abort */ @@ -458,34 +467,55 @@ static ncclResult_t socketSetFlags(struct ncclSocket* sock) { SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } - SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY"); + // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1) + int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff(); + if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF"); + if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF"); return ncclSuccess; } +static void socketResetAccept(struct ncclSocket* sock) { + char line[SOCKET_NAME_MAXLEN+1]; + INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s", + ncclSocketToString(&sock->addr, line)); + // Ignore spurious connection and accept again + (void)close(sock->fd); + sock->fd = -1; + sock->state = ncclSocketStateAccepting; + sock->finalizeCounter = 0; +} + static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { uint64_t magic; enum ncclSocketType type; int received; + char line[SOCKET_NAME_MAXLEN+1]; // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do) NCCLCHECK(socketSetFlags(sock)); if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { if (sock->asyncFlag == 0) { received = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); + if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) { + socketResetAccept(sock); + return ncclSuccess; + } } else { + int closed = 0; received = sock->finalizeCounter; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received)); + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed)); sock->finalizeCounter = received; - if (received < sizeof(magic)) return ncclSuccess; + if (received < sizeof(magic)) { + if (closed) { + socketResetAccept(sock); + } + return ncclSuccess; + } memcpy(&magic, sock->finalizeBuffer, sizeof(magic)); } if (magic != sock->magic) { - WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, 
sock->magic); - close(sock->fd); - sock->fd = -1; - // Ignore spurious connection and accept again - sock->state = ncclSocketStateAccepting; + socketResetAccept(sock); return ncclSuccess; } } @@ -500,7 +530,7 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { memcpy(&type, sock->finalizeBuffer, sizeof(type)); } if (type != sock->type) { - WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); + WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type); sock->state = ncclSocketStateError; close(sock->fd); sock->fd = -1; @@ -532,32 +562,38 @@ static ncclResult_t socketResetFd(struct ncclSocket* sock) { } goto exit; } + static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { + char line[SOCKET_NAME_MAXLEN+1]; if (errCode == 0) { sock->state = ncclSocketStateConnected; } else if (errCode == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; - } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { + } else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT || + errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { if (sock->customRetry == 0) { if (sock->errorRetries++ == ncclParamRetryCnt()) { sock->state = ncclSocketStateError; - WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries); + WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts", + funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries); return ncclRemoteError; } unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); - INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime); + INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec", + funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), + sock->errorRetries, ncclParamRetryCnt(), sleepTime); msleep(sleepTime); } NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */ sock->state = ncclSocketStateConnecting; } else { - char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; - WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); + WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); return ncclSystemError; } return ncclSuccess; } + static ncclResult_t socketStartConnect(struct ncclSocket* sock) { /* blocking/non-blocking connect() is determined by asyncFlag. */ int ret = connect(sock->fd, &sock->addr.sa, sock->salen); @@ -568,6 +604,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { struct pollfd pfd; int timeout = 1, ret; socklen_t rlen = sizeof(int); + char line[SOCKET_NAME_MAXLEN+1]; memset(&pfd, 0, sizeof(struct pollfd)); pfd.fd = sock->fd; @@ -577,10 +614,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { if (ret == 0 || (ret < 0 && errno == EINTR)) { return ncclSuccess; } else if (ret < 0) { - WARN("socketPollConnect poll() failed with error %s", strerror(errno)); - return ncclRemoteError; - } else if (ret != 1 || (pfd.revents & POLLOUT) == 0) { - WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? 
"" : ", no POLLOUT events"); + WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclSystemError; } @@ -899,7 +933,7 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { if (sock != NULL) { if (sock->fd >= 0) { - shutdown(sock->fd, how); + SYSCHECK(shutdown(sock->fd, how), "shutdown"); } sock->state = ncclSocketStateTerminating; } @@ -921,8 +955,8 @@ ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful * connection close here. */ - shutdown(sock->fd, SHUT_RDWR); - close(sock->fd); + (void)shutdown(sock->fd, SHUT_RDWR); + (void)close(sock->fd); } sock->state = ncclSocketStateClosed; sock->fd = -1; diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 7d957d432..0adb4b137 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,6 +9,12 @@ #include "checks.h" #include "param.h" +#if CUDART_VERSION >= 13000 +#define cudaStreamGetCaptureInfo_v3 cudaStreamGetCaptureInfo +#define cudaGraphAddDependencies_v2 cudaGraphAddDependencies +#define cudaStreamUpdateCaptureDependencies_v2 cudaStreamUpdateCaptureDependencies +#endif + // Tracks the captured work a given graph captured identified by its graph id. struct ncclStrongStreamCapture { struct ncclStrongStreamCapture* next; @@ -89,7 +95,11 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamGetCaptureInfo_v3(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr, nullptr)); + #else CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); + #endif if (status != cudaStreamCaptureStatusActive) { graph->origin = nullptr; graph->graph = nullptr; @@ -224,7 +234,11 @@ ncclResult_t ncclStrongStreamAcquire( CUDACHECK(cudaEventRecord(scratch, graph.origin)); CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); CUDACHECK(cudaEventDestroy(scratch)); + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(cap->captureStream, nullptr, nullptr, 0, cudaStreamSetCaptureDependencies)); + #else CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); + #endif if (mixing && firstCapture) { CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); @@ -284,7 +298,11 @@ ncclResult_t ncclStrongStreamRelease( // Make this record order after previous record on this stream. 
if (cap->lastRecord != nullptr) { + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1)); + #else CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + #endif } cap->lastRecord = recordNode; @@ -292,7 +310,11 @@ ncclResult_t ncclStrongStreamRelease( cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; size_t count = 0; + #if CUDART_VERSION >= 13000 + cudaError_t res = cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, nullptr, &count); + #else cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + #endif #if CUDART_VERSION >= 12030 if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. @@ -308,7 +330,11 @@ ncclResult_t ncclStrongStreamRelease( else { CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); for (int i=0; i < (int)count; i++) { + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, nullptr, 1)); + #else CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + #endif } } @@ -339,7 +365,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; size_t count = 0; + #if CUDART_VERSION >= 13000 + cudaError_t res = cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, nullptr, &count); + #else cudaError_t res = cudaStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count); + #endif #if CUDART_VERSION >= 12030 if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. @@ -352,7 +382,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu #endif else { CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, nullptr, count, cudaStreamSetCaptureDependencies)); + #else CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies)); + #endif } CUDACHECK(cudaStreamDestroy(tmp)); diff --git a/src/mnnvl.cc b/src/mnnvl.cc index 07e8b21d9..34a18b80a 100644 --- a/src/mnnvl.cc +++ b/src/mnnvl.cc @@ -58,7 +58,12 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { // Allocate FABRIC handle compatible memory ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN); - if (ret != ncclSuccess) return ncclSuccess; + if (ret != ncclSuccess) { + // Return an error if this is a MNNVL capable system but FABRIC handles are not supported + WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX channel configuration (/dev/nvidia-caps-imex-channels). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.", + comm->clique.size); + return ncclSystemError; + } err = CUPFN(cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); if (err != CUDA_SUCCESS || (err = CUPFN(cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC))) != CUDA_SUCCESS) { @@ -66,7 +71,7 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { (void) pfn_cuGetErrorString(err, &errStr); NCCLCHECK(ncclCuMemFree(ptr)); // Return an error if this is a MNNVL capable system but it's not working - WARN("MNNVL (cliqueSize %d) is available but not supported on this system. 
Check the IMEX configuration.", + WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX configuration (nvidia-imex-ctl -N). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.", comm->clique.size); return ncclSystemError; } diff --git a/src/nccl.h.in b/src/nccl.h.in index f3ab5344f..292a83914 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -31,6 +31,7 @@ extern "C" { #include /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; +typedef struct ncclWindow* ncclWindow_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 @@ -52,9 +53,21 @@ typedef enum { ncclSuccess = 0, #define NCCL_SPLIT_NOCOLOR -1 #define NCCL_UNDEF_FLOAT -1.0f +/* Window Registration flags */ +#define NCCL_WIN_DEFAULT 0x00 +#define NCCL_WIN_COLL_SYMMETRIC 0x01 + +/* NCCL performance policy */ +#define NCCL_CTA_POLICY_DEFAULT 0x00 +#define NCCL_CTA_POLICY_EFFICIENCY 0x01 + +/* ncclCommShrink flags*/ +#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */ +#define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */ + /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v21700 { +typedef struct ncclConfig_v22700 { /* attributes that users should never touch. */ size_t size; unsigned int magic; @@ -67,6 +80,11 @@ typedef struct ncclConfig_v21700 { const char *netName; int splitShare; int trafficClass; + const char *commName; + int collnetEnable; + int CTAPolicy; + int shrinkShare; + int nvlsCTAs; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -82,6 +100,11 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_PTR, /* netName */ \ NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ + NCCL_CONFIG_UNDEF_PTR, /* commName */ \ + NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \ + NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \ + NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \ + NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ @@ -173,6 +196,14 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/* Shrink existing communicator. + * Ranks in excludeRanksList will be removed form the existing communicator. + * Within the new communicator, ranks will be re-ordered to fill the gap of removed ones. + * If config is NULL, the new communicator will inherit the original communicator's configuration + * The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/ +ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); +ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); + /* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig. * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation. * The number of ncclUniqueIds and their order must be the same for every rank. 
@@ -216,6 +247,14 @@ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, v ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); +/* Register memory window */ +ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); + +/* Deregister symmetric memory */ +ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); +ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); + /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, diff --git a/src/plugin/net.cc b/src/plugin/net.cc index 9257d7786..78944106a 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -8,6 +8,7 @@ #include "bootstrap.h" #include "checks.h" #include "plugin.h" +#include "nccl_net.h" #include #include @@ -15,137 +16,100 @@ //#include //#include -extern ncclNet_t* getNcclNet_v6(void* netPluginLib); -extern ncclNet_t* getNcclNet_v7(void* netPluginLib); -extern ncclNet_t* getNcclNet_v8(void* netPluginLib); -extern ncclNet_t* getNcclNet_v9(void* netPluginLib); -extern ncclNet_t* getNcclNet_v10(void* netPluginLib); - -extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +typedef ncclNet_t* getNcclNet_t(void* netPluginLib); +typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib); + +extern getNcclNet_t getNcclNet_v6; +extern getNcclNet_t getNcclNet_v7; +extern getNcclNet_t getNcclNet_v8; +extern getNcclNet_t getNcclNet_v9; +extern getNcclNet_t getNcclNet_v10; +extern getNcclCollNet_t getNcclCollNet_v6; +extern getNcclCollNet_t getNcclCollNet_v7; +extern getNcclCollNet_t getNcclCollNet_v8; +extern getNcclCollNet_t getNcclCollNet_v9; +extern getNcclCollNet_t getNcclCollNet_v10; NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); +#define NCCL_NET_VERSION_COUNT 5 +int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6}; +getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; +getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; + +#define NCCL_NET_NUM_INTERNAL_PLUGINS 2 + +typedef enum ncclNetPluginState { + ncclNetPluginStateDisabled = -2, // Plugin library failed to initialize + ncclNetPluginStateLoadFailed = -1, // Plugin library failed to load + ncclNetPluginStateLoadReady = 0, // 
Plugin library is ready to be loaded + ncclNetPluginStateInitReady = 1, // Plugin library is loaded and ready to be initialized + ncclNetPluginStateEnabled = 2, // Plugin library is loaded and initialized +} ncclNetPluginState_t; + +#define MAX_STR_LEN 255 +typedef struct netPluginLib { + char name[MAX_STR_LEN]; // Name of the plugin library + void* dlHandle; // Handle to the plugin library + ncclNet_t* ncclNet; // Pointer to the ncclNet_t structure + int ncclNetVer; // Version of the nccl net plugin + ncclCollNet_t* ncclCollNet; // Pointer to the ncclCollNet_t structure + ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin + ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin + int ncclNetPluginRefCount; // Reference count for the nccl net plugin +} netPluginLib_t; + +int pluginCount = 0; +bool netPluginLibsInitialized = false; +netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 }; static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static void* netPluginLib; - -static int netPluginRefCount; -static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();} +static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; +static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { + if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { + INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + memset(pluginLib, 0, sizeof(netPluginLib_t)); + } + return ncclSuccess; +} -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT; - pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce); +static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { + pluginLib->dlHandle = ncclOpenNetPluginLib(pluginLib->name); - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; + if (pluginLib->dlHandle == nullptr) goto fail; + // load ncclNet + for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { + pluginLib->ncclNetVer = ncclNetVersion[i]; + pluginLib->ncclNet = getNcclNet[i](pluginLib->dlHandle); + if (pluginLib->ncclNet) break; } - netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN")); - if (netPluginLib == nullptr) { - goto fail; - } + // if we fail to find a net, exit + if (pluginLib->ncclNet == nullptr) goto fail; - ncclNets[0] = getNcclNet_v10(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 10; - if (ncclNets[0] == nullptr) { - // Try v9 plugin - ncclNets[0] = getNcclNet_v9(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 9; - } - if (ncclNets[0] == nullptr) { - // Try v8 plugin - ncclNets[0] = getNcclNet_v8(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 8; - } - if (ncclNets[0] == nullptr) { - // Try v7 plugin - ncclNets[0] = getNcclNet_v7(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 7; - } - if (ncclNets[0] == nullptr) { - // Try v6 plugin - ncclNets[0] = getNcclNet_v6(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 6; - } - if (ncclNets[0] == nullptr) { - goto fail; - } + pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady; - // Check for CollNet - ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); - if 
(ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v8(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v7(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v6(netPluginLib); + // load ncclColNet + for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { + pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle); + if (pluginLib->ncclCollNet) break; } - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; + if (pluginLib->ncclCollNet == nullptr) + pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; + else + pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady; + INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name); exit: - pthread_mutex_unlock(&netPluginLock); return ncclSuccess; fail: - if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - NCCLCHECK(ncclClosePluginLib(netPluginLib)); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + if (pluginLib->dlHandle) { + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; + pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed; + pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; + goto exit; } ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { @@ -172,72 +136,156 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclSuccess; } -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; +static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) { + int ndev; + if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclNet) { + if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; + if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail; + } + pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled; + INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name); + + if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { + if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) 
pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + else { + pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled; + } } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); +exit: return ncclSuccess; +fail: + pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled; + pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + goto exit; } -static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclCollNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; - else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; - else ncclCollNetStates[i] = ncclNetStateEnabled; +static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) { + const char* netName = comm->config.netName; + if (netName && strcasecmp(netName, netPluginLibs[pluginIndex].ncclNet->name) != 0) goto fail; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, netPluginLibs[pluginIndex].ncclNet, 0)) goto fail; + + if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateEnabled) { + comm->ncclNet = netPluginLibs[pluginIndex].ncclNet; + comm->ncclNetVer = netPluginLibs[pluginIndex].ncclNetVer; + comm->netPluginIndex = pluginIndex; + netPluginLibs[pluginIndex].ncclNetPluginRefCount++; + *isAssigned = true; + INFO(NCCL_INIT|NCCL_NET, "Assigned NET plugin %s to comm", netPluginLibs[pluginIndex].ncclNet->name); + if (netPluginLibs[pluginIndex].ncclCollNetPluginState >= ncclNetPluginStateEnabled) { + comm->ncclCollNet = netPluginLibs[pluginIndex].ncclCollNet; + } } - *state = ncclCollNetStates[i]; - pthread_mutex_unlock(&netLock); +exit: return ncclSuccess; +fail: + *isAssigned = false; + netPluginLibs[pluginIndex].ncclNetPluginState = ncclNetPluginStateEnabled; + netPluginLibs[pluginIndex].ncclCollNetPluginState = ncclNetPluginStateEnabled; + goto exit; } -ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; - bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { - if (ncclNets[i] == nullptr) continue; - enum ncclNetState state; - NCCLCHECK(netGetState(i, &state)); - if (state != ncclNetStateEnabled) continue; - if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; - if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { - // Mismatched device plugin version - continue; +static ncclResult_t ncclNetPluginDisableOtherExternal(int pluginIndex) { + // Only if an external plugin is enabled, disable other external plugins + if (pluginIndex >= (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) return ncclSuccess; + char names[MAX_STR_LEN*(NCCL_NET_MAX_PLUGINS - NCCL_NET_NUM_INTERNAL_PLUGINS)] = { 0 }; + for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { + if (i != pluginIndex) { + // Append all disabled plugin names to a string + snprintf(names+strlen(names), sizeof(names)-strlen(names), (strlen(names) == 0) ? 
"%s" : ", %s", netPluginLibs[i].name); + netPluginLibs[i].ncclNetPluginState = ncclNetPluginStateDisabled; } + } + if(strlen(names) > 0) { + INFO(NCCL_INIT|NCCL_NET, "Disabling external plugins: %s", names); + } + return ncclSuccess; +} - comm->ncclNet = ncclNets[i]; - comm->ncclNetVer = ncclNetsVer[i]; - ok = true; - - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; +static void initPluginLibsOnceFunc() { + char* netPluginName = nullptr; + const char* defaultNetPlugin = "libnccl-net.so"; + const char* envNetPlugin = nullptr; + char* envNetPluginList = nullptr; + char* savePtr = nullptr; + int pluginCounter = 0; + + memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t)); + envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN"); + if (envNetPlugin) { + envNetPluginList = strdup(envNetPlugin); + // Iterate over list until the list is empty + netPluginName = strtok_r(envNetPluginList, ",", &savePtr); + while(netPluginName) { + // We have 2 internal plugins (ib and socket) + // So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list + if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) { + INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); + break; + } + // need to leave space for the name + "\n" + if((strlen(netPluginName)+1) <= MAX_STR_LEN) { + netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady; + netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount(); + strcpy(netPluginLibs[pluginCounter].name, netPluginName); + pluginCounter++; + } else { + INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); } + netPluginName = strtok_r(nullptr, ",", &savePtr); } - break; + if (envNetPluginList) free(envNetPluginList); + } else { + // Add default net plugin + netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady; + netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount(); + strcpy(netPluginLibs[pluginCounter++].name, defaultNetPlugin); } - if (!ok) { - WARN("Error: network %s not found.", netName ? 
netName : ""); - return ncclInvalidUsage; + // Add 2 internal ib and socket plugins + netPluginLibs[pluginCounter].ncclNet = &ncclNetIb; + netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady; + netPluginLibs[pluginCounter].ncclNet = &ncclNetSocket; + netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady; + pluginCount = pluginCounter; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + bool ncclNetPluginInitialized = false; + pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc); + pthread_mutex_lock(&netPluginLock); + for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) { + if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) { + NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex])); + } + if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) { + NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex])); + } + if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) { + bool isAssigned = false; + NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned)); + if (isAssigned) { + // If one external plugin is assigned to a comm, then disable all other external plugins + ncclNetPluginDisableOtherExternal(pluginIndex); + ncclNetPluginInitialized = true; + break; + } + } } - return ncclSuccess; + pthread_mutex_unlock(&netPluginLock); + if (ncclNetPluginInitialized) return ncclSuccess; + WARN("Failed to initialize any NET plugin"); + return ncclInvalidUsage; } ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; - comm->ncclCollNet = nullptr; + int pluginIndex = comm->netPluginIndex; + pthread_mutex_lock(&netPluginLock); + netPluginLibs[pluginIndex].ncclNetPluginRefCount--; + for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { + NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i])); + } + pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index a43df28d3..a9c1d0dc0 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -23,7 +23,7 @@ enum ncclPluginType { static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; -static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" }; +static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" }; static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT }; static void* tryOpenLib(char* name, int* err, char* errStr) { @@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) { return handle; } -static void appendNameToList(char* nameList, int *nameListLen, char* name) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; +static void appendNameToList(char* nameList, int *leftChars, char* name) { + snprintf(nameList + PATH_MAX - *leftChars, *leftChars, " %s", name); + *leftChars -= strlen(name) + 1; } static void* openPluginLib(enum ncclPluginType type, const char* libName) { @@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { char eNoEntNameList[PATH_MAX] = { 0 }; if (libName && 
strlen(libName)) { - snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); + // match names that start with 'lib' and end with '.so' + if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) { + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - - snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } } } else { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); @@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) { } ncclResult_t ncclClosePluginLib(void* handle) { + bool found = false; for (int l=0; linit(&comm->profilerContext, &ncclProfilerEventMask); + int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); if (err) { WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); ncclProfiler = NULL; @@ -239,8 +243,6 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.type = ncclProfileColl; eDescr.parentObj = plan->groupEventHandle; eDescr.rank = plan->comm->rank; - eDescr.coll.name = plan->comm->commName; - eDescr.coll.commHash = plan->comm->commHash; eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; eDescr.coll.func = ncclFuncToString(ct->func); eDescr.coll.sendBuff = ct->sendbuff; @@ -248,7 +250,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.coll.count = ct->count; eDescr.coll.root = ct->root; eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nChannels = ct->nChannels; eDescr.coll.nWarps = ct->nWarps; eDescr.coll.algo = ncclAlgoToString(ct->algorithm); eDescr.coll.proto = ncclProtoToString(ct->protocol); @@ -264,7 +266,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { // gives the consistency. if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && (ct->eActivationMask & ncclProfileKernelCh))) - plan->comm->seqNumber[ct->func]++; + __atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED); ct = ct->next; } if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -277,13 +279,12 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.type = ncclProfileP2p; eDescr.parentObj = plan->groupEventHandle; eDescr.rank = plan->comm->rank; - eDescr.p2p.name = plan->comm->commName; - eDescr.p2p.commHash = plan->comm->commHash; eDescr.p2p.func = ncclFuncToString(pt->func); eDescr.p2p.buff = pt->buff; eDescr.p2p.count = pt->count; eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); eDescr.p2p.peer = pt->root; + eDescr.p2p.nChannels = pt->nChannels; ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); } pt = pt->next; @@ -319,7 +320,7 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { // made of sliceSteps steps rather than one step. In the profiler we are still // interested in whole network transfers though, so we account for this when // computing the actual network step number. 
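// Illustration (arbitrary values): with args->sliceSteps == 4 and sub->nsteps == 16
// the proxy advances in 4-step slices, so the profiler reports
// nSteps = DIVUP(16, 4) = 4 network steps, and a proxy stepId of 10 maps to
// profiler step DIVUP(10, 4) = 3.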
-ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) { +ncclResult_t ncclProfilerStartProxyOpEvent(int s, struct ncclProxyArgs* args) { TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -333,29 +334,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.proxyOp.peer = sub->peer; eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; - eDescr.proxyOp.isSend = 1; - ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); - } - } - TIME_STOP_EVENT(proxyOpStart); - return ncclSuccess; -} - -ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) { - TIME_START_EVENT(proxyOpStart); - struct ncclProxySubArgs* sub = &args->subs[s]; - if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileProxyOp; - eDescr.parentObj = sub->taskEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyOp.pid = sub->pid; - eDescr.proxyOp.channelId = sub->channelId; - eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); - eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; - eDescr.proxyOp.isSend = 0; + eDescr.proxyOp.isSend = args->progress == ncclTransports[TRANSPORT_NET]->send.proxyProgress ? 1 : 0; ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } @@ -385,7 +364,8 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; - ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } TIME_STOP_EVENT(proxyStepStart); @@ -403,7 +383,8 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; - ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } TIME_STOP_EVENT(proxyStepStart); @@ -415,9 +396,9 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, i struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { int step_ = DIVUP(stepId, args->sliceSteps); - if (sub->stepEventHandles[step_%NCCL_STEPS]) { - ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]); - sub->stepEventHandles[step_%NCCL_STEPS] = NULL; + if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) { + ncclProfiler->stopEvent(sub->pHandles[step_%NCCL_STEPS].stepEventHandle); + sub->pHandles[step_%NCCL_STEPS].stepEventHandle = NULL; } } TIME_STOP_EVENT(proxyStepStop); @@ -451,7 +432,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { return ncclSuccess; } -ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t 
start) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = &args->subs[s]; if (sub->eActivationMask & ncclProfileKernelCh) { @@ -459,29 +440,31 @@ ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { eDescr.type = ncclProfileKernelCh; eDescr.parentObj = sub->taskEventHandle; eDescr.kernelCh.channelId = sub->channelId; + eDescr.kernelCh.pTimer = start; ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr); } } return ncclSuccess; } -ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) { +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = &args->subs[s]; if (sub->kernelEventHandle) { + ncclProfilerEventStateArgs_t a = { }; + a.kernelCh.pTimer = stop; + ncclProfiler->recordEventState(sub->kernelEventHandle, ncclProfilerKernelChStop, &a); ncclProfiler->stopEvent(sub->kernelEventHandle); } } return ncclSuccess; } -ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { +ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { ncclProfilerEventStateArgs_t a = { }; - a.proxyOp.steps = DIVUP(steps, args->sliceSteps); - a.proxyOp.transSize = transSize; ncclProfiler->recordEventState(sub->opEventHandle, eState, &a); } TIME_STOP_EVENT(proxyOpRecord); @@ -493,8 +476,10 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { int step_ = DIVUP(stepId, args->sliceSteps); - if (sub->stepEventHandles[step_%NCCL_STEPS]) { - ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0); + if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) { + ncclProfilerEventStateArgs_t a = { }; + a.proxyStep.transSize = sub->transSize; + ncclProfiler->recordEventState(sub->pHandles[step_%NCCL_STEPS].stepEventHandle, eState, &a); } } TIME_STOP_EVENT(proxyStepRecord); @@ -547,18 +532,28 @@ bool ncclProfilerPluginLoaded(void) { ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { if (__builtin_expect(ncclProfiler != NULL, 0)) { - struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; - if (type == 0) { // start + if (type == ncclProfilerNetEventStart) { // start + struct ncclProxyEventHandle* p = (struct ncclProxyEventHandle*)pHandle; + struct ncclProxySubArgs* sub = p->subArgPtr; if (sub->eActivationMask & ncclProfileNetPlugin) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileNetPlugin; - eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS]; + eDescr.parentObj = p->stepEventHandle; eDescr.rank = sub->rank; eDescr.netPlugin.id = pluginId; eDescr.netPlugin.data = extData; ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr); } - } else { // stop + } else if (type == ncclProfilerNetEventStop) { // stop + ncclProfiler->stopEvent(*eHandle); + } else if (type == ncclProfilerNetEventUpdate) { // update + ncclProfilerEventStateArgs_t args = { }; + args.netPlugin.data = extData; + ncclProfiler->recordEventState(*eHandle, 
ncclProfilerNetPluginUpdate, &args); + } else { // update and stop + ncclProfilerEventStateArgs_t args = { }; + args.netPlugin.data = extData; + ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args); ncclProfiler->stopEvent(*eHandle); } } diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc index 139742942..2126afc68 100644 --- a/src/plugin/profiler/profiler_v1.cc +++ b/src/plugin/profiler/profiler_v1.cc @@ -53,6 +53,7 @@ static uint8_t ncclStringToDatatype(const char* dt) { } static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + *eHandle = NULL; ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; eDescr_v1.type = eDescr->type; eDescr_v1.parentObj = eDescr->parentObj; @@ -60,8 +61,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { - eDescr_v1.coll.name = eDescr->coll.name; - eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.name = nullptr; // removed in v4 + eDescr_v1.coll.commHash = 0; // removed in v4 eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; @@ -71,14 +72,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); eDescr_v1.coll.op = 0; // removed in v2 eDescr_v1.coll.trafficBytes = 0; // removed in v3 - eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nMaxChannels = eDescr->coll.nChannels; eDescr_v1.coll.nWarps = eDescr->coll.nWarps; eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); } break; case ncclProfileP2p: { - eDescr_v1.p2p.name = eDescr->p2p.name; - eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.name = nullptr; // removed in v4 + eDescr_v1.p2p.commHash = 0; // removed in v4 eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); eDescr_v1.p2p.buff = eDescr->p2p.buff; eDescr_v1.p2p.count = eDescr->p2p.count; @@ -97,21 +98,34 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v1.proxyStep.step = eDescr->proxyStep.step; } break; case ncclProfileProxyCtrl: break; - case ncclProfileKernelCh: - case ncclProfileNetPlugin: { - *eHandle = NULL; - return ncclSuccess; - } - default:; + default: return ncclSuccess; } return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { - return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs); + ncclProfilerEventStateArgs_v1_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v1->recordEventState(eHandle, eState, &args); } 
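// Compatibility shim: the core always drives profilers through the newest interface,
// and this file adapts it to a v1 plugin. Event types v1 does not know about
// (kernelCh, netPlugin) return ncclSuccess with a NULL event handle, fields the
// newer descriptor no longer carries (coll/p2p name and commHash) are filled with
// nullptr/0, and nChannels is forwarded as v1's nMaxChannels. The init wrapper
// below likewise drops the extra arguments (commName, commHash, nNodes, nranks,
// rank, logfn) that a v1 plugin cannot accept.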
-static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 52907d6e3..11e521e90 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -20,8 +20,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { - eDescr_v2.coll.name = eDescr->coll.name; - eDescr_v2.coll.commHash = eDescr->coll.commHash; + eDescr_v2.coll.name = nullptr; // removed in v4 + eDescr_v2.coll.commHash = 0; // removed in v4 eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v2.coll.func = eDescr->coll.func; eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff; @@ -30,14 +30,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v2.coll.root = eDescr->coll.root; eDescr_v2.coll.datatype = eDescr->coll.datatype; eDescr_v2.coll.trafficBytes = 0; // removed in v3 - eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v2.coll.nMaxChannels = eDescr->coll.nChannels; eDescr_v2.coll.nWarps = eDescr->coll.nWarps; eDescr_v2.coll.algo = eDescr->coll.algo; eDescr_v2.coll.proto = eDescr->coll.proto; } break; case ncclProfileP2p: { - eDescr_v2.p2p.name = eDescr->p2p.name; - eDescr_v2.p2p.commHash = eDescr->p2p.commHash; + eDescr_v2.p2p.name = nullptr; // removed in v4 + eDescr_v2.p2p.commHash = 0; // removed in v4 eDescr_v2.p2p.func = eDescr->p2p.func; eDescr_v2.p2p.buff = eDescr->p2p.buff; eDescr_v2.p2p.count = eDescr->p2p.count; @@ -62,10 +62,28 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { - return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs); + ncclProfilerEventStateArgs_v2_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v2->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc index 322bea57a..3dba3231a 100644 --- 
a/src/plugin/profiler/profiler_v3.cc +++ b/src/plugin/profiler/profiler_v3.cc @@ -6,14 +6,105 @@ #include "comm.h" #include "nccl_profiler.h" +#include "checks.h" +static ncclProfiler_t ncclProfiler; static ncclProfiler_v3_t* ncclProfiler_v3; +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + *eHandle = nullptr; + ncclProfilerEventDescr_v3_t eDescr_v3 = { }; + eDescr_v3.type = eDescr->type; + eDescr_v3.parentObj = eDescr->parentObj; + eDescr_v3.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v3.coll.name = nullptr; // removed in v4 + eDescr_v3.coll.commHash = 0; // removed in v4 + eDescr_v3.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v3.coll.func = eDescr->coll.func; + eDescr_v3.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v3.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v3.coll.count = eDescr->coll.count; + eDescr_v3.coll.root = eDescr->coll.root; + eDescr_v3.coll.datatype = eDescr->coll.datatype; + eDescr_v3.coll.nMaxChannels = eDescr->coll.nChannels; + eDescr_v3.coll.nWarps = eDescr->coll.nWarps; + eDescr_v3.coll.algo = eDescr->coll.algo; + eDescr_v3.coll.proto = eDescr->coll.proto; + } break; + case ncclProfileP2p: { + eDescr_v3.p2p.name = nullptr; // removed in v4 + eDescr_v3.p2p.commHash = 0; // removed in v4 + eDescr_v3.p2p.func = eDescr->p2p.func; + eDescr_v3.p2p.buff = eDescr->p2p.buff; + eDescr_v3.p2p.count = eDescr->p2p.count; + eDescr_v3.p2p.datatype = eDescr->p2p.datatype; + eDescr_v3.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v3.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v3.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v3.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v3.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v3.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v3.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v3.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: { + eDescr_v3.kernelCh.channelId = eDescr->kernelCh.channelId; + } break; + case ncclProfileNetPlugin: { + eDescr_v3.netPlugin.id = eDescr->netPlugin.id; + eDescr_v3.netPlugin.data = eDescr->netPlugin.data; + } break; + default: return ncclSuccess; + } + return ncclProfiler_v3->startEvent(context, eHandle, &eDescr_v3); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + ncclProfilerEventStateArgs_v3_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v3->recordEventState(eHandle, eState, &args); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { + NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask)); + ncclProfiler.startEvent = 
ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v3->finalize; + return ncclSuccess; +} + ncclProfiler_t* getNcclProfiler_v3(void* lib) { ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3"); if (ncclProfiler_v3) { + ncclProfiler.name = ncclProfiler_v3->name; + ncclProfiler.init = ncclProfiler_init; INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); - return ncclProfiler_v3; + return &ncclProfiler; } INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); return NULL; diff --git a/src/plugin/profiler/profiler_v4.cc b/src/plugin/profiler/profiler_v4.cc new file mode 100644 index 000000000..11bed891a --- /dev/null +++ b/src/plugin/profiler/profiler_v4.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_v4_t* ncclProfiler_v4; + +ncclProfiler_t* getNcclProfiler_v4(void* lib) { + ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4"); + if (ncclProfiler_v4) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name); + return ncclProfiler_v4; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4"); + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index c27d23455..74ec70f0e 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -416,6 +416,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; + if (args->pattern != ncclPatternProfiler) ncclProfilerStartProxyOpEvent(subIndex, args); return ncclSuccess; } @@ -634,10 +635,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool const int rank = comm->rank, nranks = comm->nRanks; int *nstepsSend = NULL, *nstepsRecv = NULL; PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); + struct ncclPatStep ps = {0}; NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); - struct ncclPatStep ps; do { algo.getNextOp(&ps); if (ps.flags & PatSkipped) continue; @@ -668,10 +669,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool const int rank = comm->rank, nranks = comm->nRanks; int *nstepsSend = NULL, *nstepsRecv = NULL; PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); + struct ncclPatStep ps = {0}; NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); - struct ncclPatStep ps; do { algo.getNextOp(&ps); if (ps.flags & PatSkipped) continue; @@ -933,11 +934,13 @@ void* ncclProxyProgress(void *proxyState_) { INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); break; } - void* eHandle; - ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); - if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, 
ncclProfilerProxyCtrlIdle); - if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); - ncclProfilerStopProxyCtrlEvent(eHandle); + if ((lastIdle == 0 && idle == 1) || (lastIdle == 1 && idle == 0)) { + void* eHandle; + ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); + if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle); + if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); + ncclProfilerStopProxyCtrlEvent(eHandle); + } if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 72833604f..4f8b6efc4 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -606,6 +606,10 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. break; + if (!__atomic_load_n(&ncclComms[commIdx]->peerInfoValid, __ATOMIC_ACQUIRE)) { + // Critical data is not yet initialized -- ignore the communicator. + continue; + } // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. // Comparing just the commHash is OK though within communicators that are part of the same process. if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { @@ -651,6 +655,8 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { struct ncclComm* ncclComm = ncclComms[commIdx]; + if (!__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE)) + continue; comm->commId.commHash = ncclComm->commHash; comm->commId.hostHash = ncclComm->peerInfo->hostHash; @@ -663,15 +669,15 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL commIdx++) { ncclComm = ncclComms[commIdx]; struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; - ncclResult_t asyncError; rank->commRank = ncclComm->rank; // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially // always 0. It will increase after we send this response back to the peer we got the request from. 
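// Snapshot this rank's view of the communicator: sequence numbers, init state,
// async errors (read with acquire loads from the comm and, when a proxy thread
// exists, from its proxyState), and the finalize/destroy/abort flags.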
rank->peerIdx = 0; memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); rank->status.initState = ncclComm->initState; - if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; + rank->status.asyncError = __atomic_load_n(&ncclComm->asyncResult, __ATOMIC_ACQUIRE); + if (rank->status.asyncError == ncclSuccess && ncclComm->proxyState) + rank->status.asyncError = __atomic_load_n(&ncclComm->proxyState->asyncResult, __ATOMIC_ACQUIRE); rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); rank->status.destroyFlag = (ncclComm->destroyFlag != 0); rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); @@ -680,7 +686,7 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL comm->nRanks++; } // for (commIdx) - if (firstNewSkipMissingIdx != -1 && + if (__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE) && firstNewSkipMissingIdx != -1 && memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { // Fill in the missingRanks array that follows the comm->ranks. struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 43aa042a7..1194e61b5 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -365,15 +365,16 @@ ncclResult_t rasNetAcceptNewSocket() { NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail); NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail); - if (sock->sock.fd != -1) { - NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - rasPfds[sock->pfd].fd = sock->sock.fd; - rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also - // helps the code tell the sides apart. - sock->status = RAS_SOCK_CONNECTING; - - INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine)); - } + if (sock->sock.fd == -1) + goto fail; // We'll return ncclSuccess, but we need to clean up the incomplete socket first. + + NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); + rasPfds[sock->pfd].fd = sock->sock.fd; + rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also + // helps the code tell the sides apart. + sock->status = RAS_SOCK_CONNECTING; + + INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine)); exit: return ret; @@ -480,7 +481,10 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { // Once we get an EOF when receiving data, we finalize the termination. // For not fully established sockets, we can terminate immediately as there's no useful data to extract. void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) { - assert(sock->status != RAS_SOCK_CLOSED); + if (sock->status == RAS_SOCK_CLOSED) { + INFO(NCCL_RAS, "RAS socket in closed state passed for termination -- internal error?"); + // The code below can actually handle such a case gracefully. + } if (sock->conn) { struct rasConnection* conn = sock->conn; // If the sock of the connection points back to us, it means that we are the current socket of this @@ -542,8 +546,10 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } else { // Either the caller requested finalization or we cannot receive on it. 
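// A socket that is not fully established may never have been assigned a poll
// entry (sock->pfd is still -1 in that case), so only clear its rasPfds slot
// when one exists.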
(void)ncclSocketClose(&sock->sock); - rasPfds[sock->pfd].fd = -1; - rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + if (sock->pfd != -1) { + rasPfds[sock->pfd].fd = -1; + rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + } free(sock->recvMsg); freeSockEntry(sock); } diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index 2ab7e9448..d9d9fb436 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -1,6 +1,7 @@ #include "register.h" #include "transport.h" #include "enqueue.h" +#include "register_inline.h" static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { if (conn->connected) { @@ -61,32 +62,34 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { if (comm->planner.persistent && ncclParamGraphRegister()) { - ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); - if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + if (info->func == ncclFuncAllGather) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + } else if (info->func == ncclFuncReduceScatter) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } else if (info->func == ncclFuncAllReduce) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + } } if (collnetReged == 0 && ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); - if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + if (info->func == ncclFuncAllGather) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle); + } else if (info->func == ncclFuncReduceScatter) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } else if (info->func == ncclFuncAllReduce) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + } } } if (nvlsReged) { *regNeedConnect = 0; /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */ - if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) { - // RS: Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); - } - else { - // AR/AG: Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 
8 : 4)); - } - } else { - // Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6)); - } + int recChannels; + NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels)); + info->nMaxChannels = recChannels; info->regBufType |= NCCL_NVLS_REG_BUFFER; } @@ -188,7 +191,7 @@ ncclResult_t ncclRegisterCollBuffers( struct ncclChannel* channel = comm->channels; int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0; void *sendHandle, *recvHandle; - if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) { + if (info->func != ncclFuncReduceScatter && comm->isAllDirectP2p) { for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { for (int down = 0; down < 2; ++down) { int peer = down ? channel->collnetDirect.down[r] : channel->collnetDirect.up[r]; @@ -308,7 +311,7 @@ ncclResult_t ncclRegisterCollBuffers( } } } - if (nPeers > 0 && comm->intraNodeP2pSupport) { + if (nPeers > 0 && comm->isAllDirectP2p) { if (comm->planner.persistent && ncclParamGraphRegister()) { ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); } @@ -365,7 +368,7 @@ ncclResult_t ncclRegisterCollBuffers( void *sendHandle, *recvHandle; NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; - if (comm->intraNodeP2pSupport) { + if (comm->isAllDirectP2p) { for (int c = 0; c < comm->nChannels; ++c) { struct ncclChannel* channel = comm->channels + c; struct ncclTree* tree = NULL; diff --git a/src/register/register.cc b/src/register/register.cc index 930367a97..59928f57e 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -10,24 +10,21 @@ #include "net.h" #include "register.h" #include "transport.h" +#include "group.h" -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; +NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); - *reg = NULL; - for (int slot=0; /*true*/; slot++) { - if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; - if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - *reg = cache->slots[slot]; - return ncclSuccess; +static ncclResult_t regFindHandleFromSymAddr(struct ncclComm* comm, void* baseSymPtr, struct ncclReg** handle) { + struct ncclRegCache* cache = &comm->regCache; + *handle = NULL; + for (int slot = 0; slot < cache->population; slot++) { + if (baseSymPtr == cache->slots[slot]->baseSymPtr) { + *handle = cache->slots[slot]; + break; } } + return ncclSuccess; } -NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { if (reg && isValid) { @@ -43,14 +40,14 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); struct ncclRegCache* cache = &comm->regCache; uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + 
pageSize-1)/pageSize; + uintptr_t begAddr = (uintptr_t)data & -pageSize; + uintptr_t endAddr = ((uintptr_t)data + size + pageSize-1) & -pageSize; if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister")); INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); for (int slot=0; /*true*/; slot++) { - if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { + if ((slot == cache->population) || (begAddr < cache->slots[slot]->begAddr)) { if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity; NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); @@ -58,15 +55,15 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); struct ncclReg* regSlot = cache->slots[slot]; - regSlot->addr = addr; - regSlot->pages = pages; + regSlot->begAddr = begAddr; + regSlot->endAddr = endAddr; if (isGraph) regSlot->graphRefs = 1; else regSlot->localRefs = 1; cache->population += 1; *handle = regSlot; goto exit; - } else if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + } else if ((cache->slots[slot]->begAddr <= begAddr) && + (cache->slots[slot]->endAddr >= endAddr)) { if (isGraph) cache->slots[slot]->graphRefs++; else cache->slots[slot]->localRefs++; *handle = cache->slots[slot]; @@ -120,7 +117,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) { struct ncclRegCache* cache = &comm->regCache; for (int i = 0; i < cache->population; i++) { struct ncclReg* reg = cache->slots[i]; - INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages); + INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->begAddr, (reg->endAddr-reg->begAddr)/cache->pageSize); NCCLCHECK(regCleanup(comm, reg)); free(reg); } @@ -177,3 +174,104 @@ ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *hand NCCLCHECK(commDeregister(comm, true, handle)); return ncclSuccess; } + +ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle) { + ncclResult_t ret = ncclSuccess; + void* regSymAddr = NULL; + ALIGN_SIZE(comm->symAllocHead, alignment); + NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, baseSize, memHandle, ®SymAddr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, baseSize, regSymAddr), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + comm->symAllocHead += baseSize; + regHandle->baseSymPtr = regSymAddr; + regHandle->symSize = baseSize; +exit: + return ret; +fail: + regHandle->baseSymPtr = NULL; + regHandle->symSize = 0; + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags) { + ncclResult_t ret = ncclSuccess; + CUmemGenericAllocationHandle memHandle; + size_t baseSize; + void* baseAddr = NULL; + struct ncclReg* regHandle = NULL; + int saveDev; + + *win = NULL; + + CUDACHECK(cudaGetDevice(&saveDev)); + 
NCCLCHECK(ncclGroupStartInternal()); + if (!ncclParamLocalRegister() || !ncclCuMemEnable()) { + goto exit; + } + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + if (comm && buff && size && win) { + size_t alignment = 0; + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)buff), ret, fail); + // size and alignment check + if (!((uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0 && baseSize % NCCL_REC_PAGE_SIZE == 0 && (uintptr_t)buff + size <= (uintptr_t)baseAddr + baseSize)) { + WARN("buffer %p (baseAddr %p align %d) size %zu (baseSize %ld align %d) does not satisfy symmetric registration requirements", buff, baseAddr, (uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0, size, baseSize, baseSize % NCCL_REC_PAGE_SIZE == 0); + goto fail; + } + NCCLCHECKGOTO(ncclRegister(comm, baseAddr, baseSize, false, (void**)®Handle), ret, fail); + NCCLCHECKGOTO(ncclCalloc(win, 1), ret, fail); + (*win)->handle = regHandle; + regHandle->winFlags = winFlags; + if (regHandle->baseSymPtr == NULL && comm->symmetricSupport) { + struct ncclSymRegTask* task; + CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, baseAddr), ret, fail); + CUCHECKGOTO(cuMemRelease(memHandle), ret, fail); + alignment = baseSize >= NCCL_REC_PAGE_SIZE * 72L ? NCCL_MAX_PAGE_SIZE : NCCL_REC_PAGE_SIZE; + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + task->buff = buff; + task->baseSize = baseSize; + task->memHandle = memHandle; + task->regHandle = regHandle; + task->alignment = alignment; + ncclIntruQueueEnqueue(&comm->symRegTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + } + } + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ret = ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + free(*win); + *win = NULL; + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); +ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclReg* regHandle; + CUDACHECK(cudaGetDevice(&saveDev)); + if (win == NULL) goto exit; + regHandle = win->handle; + if (regHandle && ncclParamLocalRegister() && ncclCuMemEnable()) { + if (regHandle->baseSymPtr) { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); + NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); + } + NCCLCHECKGOTO(commDeregister(comm, false, regHandle), ret, fail); + } + free(win); +exit: + CUDACHECK(cudaSetDevice(saveDev)); + return ret; +fail: + goto exit; +} diff --git a/src/symmetric.cc b/src/symmetric.cc new file mode 100644 index 000000000..f5b1e6c22 --- /dev/null +++ b/src/symmetric.cc @@ -0,0 +1,296 @@ +#include "symmetric.h" +#include "comm.h" +#include "device.h" +#include + +constexpr char const* kernelName[] = { + // Must align with enum ncclSymKernelId definition in src/include/symmetric.h + "AllReduce_AGxLL_R", + "AllReduce_AGxLLMC_R", + "AllReduce_RSxLD_AGxST", + "AllReduce_RSxLDMC_AGxSTMC", + "AllGather_LL", + "AllGather_LLMC", + "AllGather_ST", + "AllGather_STMC", + "ReduceScatter_LL", + "ReduceScatter_LD", + "ReduceScatter_LDMC" +}; + +constexpr uint32_t kernelMask_STMC = 1<nRanks; + int nMaxBlocks = ncclSymMaxBlocks; + int nMaxBlocksNvls = divUp((comm->cudaArch < 1000 ? 
16 : 32), nRanks); + size_t busBytes; // max(bytes sent, bytes received) + double busMultiplier = 1; + + switch (k) { + default: + busBytes = size_t(1)<<50; + break; + + case ncclSymKernelId_AllReduce_AGxLL_R: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_AllReduce_AGxLLMC_R: + busBytes = nRanks*nBytes*LL_BusFactor; + busMultiplier = 1.1; // To beat non-MC LL + break; + case ncclSymKernelId_AllReduce_RSxLD_AGxST: + busBytes = 2*nBytes*(nRanks-1)/nRanks; + break; + case ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC: + busBytes = nBytes/nRanks + nBytes; + busMultiplier = nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + + case ncclSymKernelId_AllGather_LL: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_AllGather_LLMC: + busBytes = nRanks*nBytes*LL_BusFactor; + busMultiplier = 1.1; // To beat non-MC LL + break; + case ncclSymKernelId_AllGather_ST: + busBytes = (nRanks-1)*nBytes; + break; + case ncclSymKernelId_AllGather_STMC: + busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. + busMultiplier = 0.55*nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + + case ncclSymKernelId_ReduceScatter_LL: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_ReduceScatter_LD: + busBytes = (nRanks-1)*nBytes; + break; + case ncclSymKernelId_ReduceScatter_LDMC: + busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. + busMultiplier = 0.55*nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + } + + nMaxBlocks = std::min(nMaxBlocks, comm->config.maxCTAs); + int nMinBlocks = comm->config.minCTAs; + + int nUserCTAs = std::min(ncclSymMaxBlocks, ncclParamSymCTAs()); + if (nUserCTAs > 0) nMinBlocks = nMaxBlocks = nUserCTAs; + + bool isLL = kernelMask_LL>>k & 1; + bool isAG = kernelMask_AG>>k & 1; + bool isAR = kernelMask_AR>>k & 1; + constexpr double GBps = (1<<30)/1.e6; + double baseLat, smBw, peakBw; + if (comm->cudaArch < 1000) { + baseLat = isLL ? 4.5 : 7.8; + smBw = isAR ? 65*GBps : 44*GBps; + peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; + } else { + baseLat = isLL ? (isAG ? 8.5 : 11) : (isAR ? 19.5 : 13.0); + smBw = 55*GBps; + peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 1000*GBps : 600*GBps; + } + *nBlocks = nMaxBlocks; + *timeUs = model(busBytes, baseLat, nMaxBlocks, smBw, busMultiplier, peakBw); + // Use least number of blocks that puts us within a tolerance of peak performance. 
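+  // model() is defined earlier in this file and is not visible in this hunk; a
+  // plausible form, consistent with its arguments, would be
+  //   time(us) = baseLat + busBytes / min(nBlocks * smBw * busMultiplier, peakBw)
+  // so the search below picks the smallest block count whose estimate stays within
+  // 2.5% of the estimate obtained with nMaxBlocks.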
+ for (int bn = nMinBlocks; bn < nMaxBlocks; bn++) { + double time = model(busBytes, baseLat, bn, smBw, busMultiplier, peakBw); + if (time <= 1.025*(*timeUs)) { + *nBlocks = bn; + *timeUs = time; + break; + } + } +} + +bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { + bool isFloat; + switch (ty) { + case ncclFloat64: + case ncclFloat32: + case ncclFloat16: + case ncclBfloat16: + case ncclFloat8e4m3: + case ncclFloat8e5m2: + isFloat = true; + break; + default: + isFloat = false; + break; + } + + switch (coll) { + case ncclFuncAllGather: + return true; + case ncclFuncAllReduce: + case ncclFuncReduceScatter: + return red == ncclDevSum && isFloat && ty != ncclFloat64; + default: + return false; + } +} + +ncclResult_t ncclSymPickKernel( + struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, + float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps + ) { + uint32_t kmask = kernelMask_coll(coll); + kmask &= kernelMask_user(); + + bool hasSTMC = comm->nvlsSupport; + bool hasLDMC = false; + if (comm->nvlsSupport) { + switch (ty) { + case ncclInt32: + case ncclUint32: + case ncclInt64: + case ncclUint64: + case ncclFloat16: + case ncclBfloat16: + hasLDMC = red == ncclDevSum || red == ncclDevMinMax; + break; + case ncclFloat8e4m3: + case ncclFloat8e5m2: + hasLDMC = red == ncclDevSum || red == ncclDevMinMax; + hasLDMC &= comm->compCap >= 100; + break; + case ncclFloat: + case ncclDouble: + hasLDMC = red == ncclDevSum; + break; + default: break; + } + } + if (!hasSTMC) kmask &= ~kernelMask_STMC; + if (!hasLDMC) kmask &= ~kernelMask_LDMC; + + size_t nBytes = nElts*ncclTypeSize(ty); + size_t nBusBytes = (coll == ncclFuncAllReduce ? 1 : comm->nRanks)*nBytes; + // LL kernels use 32-bit ints to track element counts and indices. 
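+  // Example: an 8-rank AllGather of 300 MiB per rank gives nBusBytes = 8 * 300 MiB
+  // = 2.34 GiB >= 2 GiB, so the LL kernels (32-bit element tracking) are masked out
+  // below; once nBusBytes reaches 32x that limit (64 GiB) the whole symmetric kernel
+  // mask is cleared and no symmetric kernel is considered.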
+ if (nBusBytes >= (size_t(2)<<30)) kmask &= ~kernelMask_LL; + // Any kernel might use 32-bit int to track unrolled loop chunks (which are going + // to be at least 32 bytes per chunk) + if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0; + + ncclSymKernelId bestKernel = ncclSymKernelId_Count; + float bestTime = 1.e30f; + int bestBlocks = 999; + + constexpr float smPenalty = .025f; // 2.5% percent increase in time per SM + uint32_t kmaskRemain = kmask; + while (kmaskRemain != 0) { + ncclSymKernelId k = (ncclSymKernelId)popFirstOneBit(&kmaskRemain); + float kTime; + int kBlocks; + queryModel(comm, k, nBytes, &kTime, &kBlocks); + if (kTime*(1.0f + smPenalty*kBlocks) < bestTime*(1.0f + smPenalty*bestBlocks)) { + bestKernel = k; + bestTime = kTime; + bestBlocks = kBlocks; + } + } + + *kernelId = bestKernel; + *estTimeUs = kmask==0 || kernelMask_user() == (1<= ncclSymKernelId_Count) { + return "Unknown"; + } + return kernelName[kernelId]; +} diff --git a/src/transport.cc b/src/transport.cc index f98b77a43..d98b98b1b 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -71,7 +71,7 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128); NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0); #include -ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) { +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode) { bool supportFlag = true; bool directFlag = false; if (comm->localRanks == 1) { @@ -84,8 +84,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer]; struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer]; int canConnect = 0; - NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo)); - if (!canConnect && supportFlag == true) { + int intermediateRank = -1; + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, ipeerInfo->rank, jpeerInfo->rank, &canConnect, NULL, &intermediateRank)); + if (!canConnect || intermediateRank != -1) { supportFlag = false; } if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true; @@ -93,9 +94,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p } } } - *intraNodeP2pSupport = supportFlag; + *isAllDirectP2p = supportFlag; *directMode = directFlag; - if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag); + if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type isAllDirectP2p %d directMode %d", supportFlag, directFlag); return ncclSuccess; } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 84e1f84a0..386865e21 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -13,6 +13,7 @@ #include "assert.h" #include "bootstrap.h" #include "channel.h" +#include "register_inline.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); @@ -1188,7 +1189,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use goto exit; } else { /* start register collnet buffer */ - struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + struct collnetRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr }; void* handle = NULL; struct ncclConnInfo* conn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn; @@ -1389,7 +1390,7 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; char line[1024]; - if (comm->collNetSupport == 0) goto exit; + if (comm->config.collnetEnable == 0) goto exit; // Connect Collnet + chain for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; @@ -1421,7 +1422,7 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; - if (comm->collNetSupport == 0) goto exit; + if (comm->config.collnetEnable == 0) goto exit; // Connect intra-node CollNet + Direct for (int c = 0; c < comm->nChannels; c++) { @@ -1498,8 +1499,8 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop comm->collNetHeads = headsUnique; comm->collNetHeadsNum = nHeadsUnique; - if (parent && parent->collNetSupport && parent->nNodes == comm->nNodes) { - if (!parent->config.splitShare) { + if (parent && parent->config.collnetEnable && parent->nNodes == comm->nNodes) { + if (!parent->shareResources) { collNetSetupFail = 1; goto fail; } @@ -1547,9 +1548,6 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); } else { - /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot - * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be - * lifted by sharp plugin/IB hardware in the future. */ collNetSetupFail = 1; if (comm->rank == 0) { WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); @@ -1629,7 +1627,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop return ret; fail: ncclTransportCollNetFree(comm); - comm->collNetSupport = 0; + comm->config.collnetEnable = 0; goto exit; } diff --git a/src/transport/net.cc b/src/transport/net.cc index 61b15ce20..c0cd20d6e 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -16,6 +16,7 @@ #include "transport.h" #include "shm.h" #include +#include "register_inline.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -629,8 +630,6 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; - resources->netDeviceVersion = props.netDeviceVersion; - resources->netDeviceType = props.netDeviceType; /* point-to-point size limits*/ resources->maxP2pBytes = props.maxP2pBytes; if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { @@ -732,7 +731,14 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + // let only one localrank connect to a tpRemoteRank to avoid duplicate connections + if (comms->activeConnect[resources->channelId] == 0) + 
comms->activeConnect[resources->channelId] = (resources->tpLocalRank + 1); + if (comms->sendComm[resources->channelId] == NULL + && comms->activeConnect[resources->channelId] == (resources->tpLocalRank + 1)) { + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, + comms->sendComm + resources->channelId, &resources->netDeviceHandle); + } resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { @@ -886,7 +892,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; - if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); + // reuse handle to for netdev/remote rank to avoid duplicate connections + if (comms->activeAccept[resources->channelId] == 0) + comms->activeAccept[resources->channelId] = (resources->tpLocalRank + 1); + //try connecting while comm is null + if (comms->recvComm[resources->channelId] == NULL + && comms->activeAccept[resources->channelId] == (resources->tpLocalRank + 1)) { + ret = proxyState->ncclNet->accept(resources->netListenComm, + comms->recvComm+resources->channelId, &resources->netDeviceHandle); + } resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { @@ -1101,7 +1115,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; - ncclProfilerStartSendProxyOpEvent(s, args); + ncclProfilerRecordProxyOpEventState(s, args, ncclProfilerProxyOpInProgress_v4); if (!sub->reg) sub->sendMhandle = resources->mhandles[args->protocol]; } @@ -1140,7 +1154,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } else { sub->posted += args->sliceSteps; } - ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted); ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait); args->idle = 0; continue; @@ -1188,18 +1201,17 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } } if (ready) { - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); + ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendPeerWait_v4); // Data is ready, try to send. // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
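+          // Each in-flight slice now carries its own profiler handle: the slot below is
+          // indexed by DIVUP(stepId, sliceSteps) % NCCL_STEPS, i.e. one entry per
+          // outstanding step in the NCCL_STEPS ring, so the network plugin can attribute
+          // its events (e.g. per-QP completions) to the step that posted the isend.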
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); + void* phandle = &sub->pHandles[DIVUP(transmittedStepId, args->sliceSteps)%NCCL_STEPS]; + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, phandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); - sub->transSize += size; + sub->transSize = size; sub->transmitted += args->sliceSteps; - sub->profilerSteps++; - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; continue; @@ -1220,7 +1232,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]); sub->done += args->sliceSteps; ncclProfilerStopProxyStepEvent(s, args, doneStepId); - ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; @@ -1282,7 +1293,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->posted = sub->received = sub->transmitted = sub->done = 0; sub->regBufferReady = 0; for (int i=0; ireg) sub->recvMhandle = resources->mhandles[args->protocol]; } @@ -1343,7 +1354,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; - phandles[subCount] = sub; + phandles[subCount] = &sub->pHandles[DIVUP(postedStepId, args->sliceSteps)%NCCL_STEPS]; subCount++; } } @@ -1362,8 +1373,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; - sub->profilerSteps++; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } args->idle = 0; @@ -1393,9 +1402,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[buffSlot].size = -1; - sub->transSize += sizes[i]; + sub->transSize = sizes[i]; sub->received += args->sliceSteps; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived); ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, 
ncclProfilerProxyStepRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1459,7 +1467,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int transmittedStepId = sub->transmitted; sub->transmitted += args->sliceSteps; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted); ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); @@ -1479,7 +1486,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; - int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1494,9 +1500,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } + int doneStepId = sub->done; sub->done += args->sliceSteps; ncclProfilerStopProxyStepEvent(s+i, args, doneStepId); - ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone); args->idle = 0; if (sub->done == sub->nsteps) { args->done++; @@ -1547,9 +1553,9 @@ static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size if (found) { *outRegBufFlag = 1; outHandle[p] = netHandle->handle; - INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle); + INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, regRecord->endAddr - regRecord->begAddr, netHandle->handle); } else { - struct netRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + struct netRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr }; void* handle = NULL; if (peerConn->conn.flags & NCCL_DIRECT_NIC) { diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index c049531f8..19a505e1c 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -25,8 +25,10 @@ #include "timer.h" #include "ibvwrap.h" +#include "mlx5/mlx5dvwrap.h" -#define MAXNAMESIZE 64 +#define MAXSUFFIXSIZE 16 +#define MAXNAMESIZE (64 + MAXSUFFIXSIZE) static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; @@ -55,6 +57,17 @@ struct ncclIbStats { int fatalErrorCount; }; +enum ncclIbProvider { + IB_PROVIDER_NONE = 0, + IB_PROVIDER_MLX5 = 1, + IB_PROVIDER_MAX = 2, +}; + +const char* ibProviderName[] = { + "None", + "Mlx5", +}; + static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { pthread_mutex_t lock; @@ -77,6 +90,12 @@ struct alignas(64) ncclIbDev { struct ibv_port_attr portAttr; struct ncclIbStats stats; int dmaBufSupported; + enum ncclIbProvider ibProvider; + union { + struct { + int dataDirect; + } mlx5; + } capsProvider; }; #define MAX_IB_DEVS 32 @@ -106,6 +125,7 @@ NCCL_PARAM(IbAdaptiveRouting, 
"IB_ADAPTIVE_ROUTING", -2); NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1); NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1); NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1); +NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1); static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) { __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED); @@ -451,6 +471,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { + // Merge multi-port NICs into the same PCI device + p[strlen(p)-1] = '0'; + // Also merge virtual functions (VF) into the same device + if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // Keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dndevs > 1) { - WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL."); + INFO(NCCL_NET, "NET/IB : Skipping makeVDevice, NCCL_IB_MERGE_NICS=0"); return ncclInvalidUsage; } @@ -565,14 +609,17 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } + if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); } if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { + int nIpIfs = 0; ncclNIbDevs = 0; ncclNMergedIbDevs = 0; - if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { + NCCLCHECK(ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1, &nIpIfs)); + if (nIpIfs != 1) { WARN("NET/IB : No IP interface found."); ret = ncclInternalError; goto fail; @@ -600,6 +647,17 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr WARN("NET/IB : Unable to open device %s", devices[d]->name); continue; } + enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; + char dataDirectDevicePath[PATH_MAX]; + int dataDirectSupported = 0; + if (wrap_mlx5dv_is_supported(devices[d])) { + ibProvider = IB_PROVIDER_MLX5; + snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); + if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { + INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); + if(ncclParamIbDataDirect()) dataDirectSupported = 1; + } + } int nPorts = 0; struct ibv_device_attr devAttr; memset(&devAttr, 0, sizeof(devAttr)); @@ -609,58 +667,69 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - struct ibv_port_attr portAttr; - if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { - WARN("NET/IB : Unable to query port_num %d", port_num); - continue; - } - if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + for (int dataDirect = 0; dataDirect < 1 + dataDirectSupported; ++dataDirect) { + struct ibv_port_attr portAttr; + if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { + WARN("NET/IB : Unable to query port_num %d", port_num); + 
continue; + } + if (portAttr.state != IBV_PORT_ACTIVE) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND + && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; - // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { - continue; + // check against user specified HCAs/ports + if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { + continue; + } + pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; + ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; + ncclIbDevs[ncclNIbDevs].portAttr = portAttr; + ncclIbDevs[ncclNIbDevs].portNum = port_num; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; + if (!dataDirect) { + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); + } + else { + snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); + NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); + strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); + ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + } + ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); + + // Enable ADAPTIVE_ROUTING by default on IB networks + // But allow it to be overloaded by an env parameter + ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; + if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); + + INFO(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, + NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + + PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); + ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); + PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d + + // Add this plain physical device to the list of virtual devices + int vDev; + ncclNetVDeviceProps_t vProps = {0}; + vProps.ndevs = 1; + vProps.devs[0] = ncclNIbDevs; + NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); + + ncclNIbDevs++; + nPorts++; } - pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; - ncclIbDevs[ncclNIbDevs].portAttr = portAttr; - ncclIbDevs[ncclNIbDevs].portNum = port_num; - ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); - ncclIbDevs[ncclNIbDevs].context = context; - ncclIbDevs[ncclNIbDevs].pdRefs = 0; - ncclIbDevs[ncclNIbDevs].pd = NULL; - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; - ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; - ncclIbDevs[ncclNIbDevs].mrCache.population = 0; - ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; - NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); - - // Enable ADAPTIVE_ROUTING by default on IB networks - // But allow it to be overloaded by an env parameter - ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; - if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - - TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); - - PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); - ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); - PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d - - // Add this plain physical device to the list of virtual devices - int vDev; - ncclNetVDeviceProps_t vProps = {0}; - vProps.ndevs = 1; - vProps.devs[0] = ncclNIbDevs; - NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); - - ncclNIbDevs++; - nPorts++; } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } } @@ -779,6 +848,9 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->forceFlush = 0; + if (ibDev->capsProvider.mlx5.dataDirect) { + props->forceFlush = 1; + } props->latency = 0; // Not set props->port = ibDev->portNum + ibDev->realPort; props->maxComms = ibDev->maxQp; @@ -893,6 +965,7 @@ struct ncclProfilerInfo { int qpIndex[MAX_QPS_PER_REQ]; int nEventHandles; ncclProfilerNetIbDescr_v1_t data; + void* pHandle; }; struct ncclIbRequest { @@ -1312,23 +1385,27 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; // info logging - if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB - for (int q = 0; q < comm->base.nqps; q++) { - // Print just the QPs for this dev - if (comm->base.qps[q].devIndex == i) + for (int q = 0; q < comm->base.nqps; q++) { + // Print just the QPs for this dev + if (comm->base.qps[q].devIndex == i) { + if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", - comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", - dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, - devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); - } - } else { // RoCE - for (int q = 0; q < comm->base.nqps; q++) { - // Print just the QPs for this dev - if (comm->base.qps[q].devIndex == i) - INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", - comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, - commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, - devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); + comm->base.vProps.ndevs > 2 ? 
"NCCL MergedDev" : "NCCL Dev", + dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, + devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); + } else { // RoCE + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", + comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, + commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, + (int64_t)commDev->base.gidInfo.localGidIndex, + devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); + } + // Log ECE info + if (meta.qpInfo[q].ece_supported) { + INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, + meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask); + } } } if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; @@ -1406,8 +1483,14 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan ncclIbSendCommDev* commDev = comm->devs + devIndex; struct ibv_qp* qp = comm->base.qps[q].qp; - if (remQpInfo->ece_supported) + if (remQpInfo->ece_supported) { + struct ncclIbQp* nqp = comm->base.qps + q; + int ibDevN = comm->devs[nqp->devIndex].base.ibDevN; + struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; + INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + ibDevN, ibDev->portNum, qp->qp_num, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail); + } ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); @@ -1415,16 +1498,6 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } - if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE - for (int q = 0; q < comm->base.nqps; q++) { - struct ncclIbQp* qp = comm->base.qps + q; - int ibDevN = comm->devs[qp->devIndex].base.ibDevN; - struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; - INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", - ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); - } - } - comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs); comm->base.ready = 1; @@ -1750,9 +1823,8 @@ ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbReque if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->base = base; r->sock = NULL; - r->devBases[0] = NULL; - r->devBases[1] = NULL; - r->events[0] = r->events[1] = 0; + memset(r->devBases, 0, sizeof(r->devBases)); + memset(r->events, 0, sizeof(r->events)); *req = r; return ncclSuccess; } @@ -1789,7 +1861,11 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; if (fd != -1) { /* DMA-BUF support */ - NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, 
returning); + if (!ncclIbDevs[base->ibDevN].capsProvider.mlx5.dataDirect) { + NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + } else { + NCCLCHECKGOTO(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT), res, returning); + } } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support @@ -1897,7 +1973,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1989,19 +2065,21 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl struct ibv_send_wr* bad_wr; #ifdef NCCL_ENABLE_NET_PROFILING // QP profiling loop - for (int r=0; rpInfo[0].nEventHandles; - reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + assert(nEventHandles < MAX_QPS_PER_REQ); + reqs[r]->pInfo[0].qpIndex[nEventHandles] = qpIndex; // Store info for profiler - int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; reqs[r]->pInfo[0].data.type = ncclProfileQp; reqs[r]->pInfo[0].data.qp.device = devIndex; reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; - NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + void* pHandle = reqs[r]->pInfo[0].pHandle; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, pHandle, pluginId, &reqs[r]->pInfo[0].data)); reqs[r]->pInfo[0].nEventHandles++; } #endif @@ -2023,8 +2101,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; - if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } - if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { + WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); + *request = NULL; + return ncclInternalError; + } NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; @@ -2065,6 +2146,9 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* req->send.size = size; req->send.data = data; req->send.offset = 0; +#ifdef NCCL_ENABLE_NET_PROFILING + req->pInfo[0].pHandle = phandle; +#endif // Populate events int nEvents = ncclParamIbSplitDataOnQps() ? 
comm->base.nqps : comm->base.nDataQps; @@ -2094,7 +2178,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); + NCCLCHECK(ncclIbMultiSend(comm, slot)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2187,8 +2271,11 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; - if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } - if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { + WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); + *request = NULL; + return ncclInternalError; + } if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); @@ -2222,14 +2309,17 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); #ifdef NCCL_ENABLE_NET_PROFILING // Start a QP event for every request in the multirecv and every qp - for (int r = 0; r < n && phandles; r++) { + for (int r = 0; r < n; r++) { + int nEventHandles = req->pInfo[r].nEventHandles; + assert(nEventHandles < MAX_QPS_PER_REQ); + req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex; // Store info for profiler - int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; req->pInfo[r].data.type = ncclProfileQp; req->pInfo[r].data.qp.device = qp->devIndex; req->pInfo[r].data.qp.wr_id = wr.wr_id; req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; - NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data)); req->pInfo[r].nEventHandles++; } #endif @@ -2311,7 +2401,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { sizes[i] = r->recv.sizes[i]; #ifdef NCCL_ENABLE_NET_PROFILING for (int j = 0; j < r->pInfo[i].nEventHandles; j++) { - NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL)); + NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } @@ -2320,7 +2410,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { sizes[0] = r->send.size; #ifdef NCCL_ENABLE_NET_PROFILING for (int j = 0; j < r->pInfo[0].nEventHandles; j++) { - NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL)); + NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } @@ -2368,20 +2458,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d", - ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); + TRACE(NCCL_NET, 
"Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d,%d,%d}, i=%d", + ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], req->events[2], req->events[3], i); #endif if (req && req->type == NCCL_NET_IB_REQ_SEND) { for (int j = 0; j < req->nreqs; j++) { struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff); if ((sendReq->events[i] <= 0)) { - WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j); + WARN("NET/IB: sendReq(%p)->events={%d,%d,%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], sendReq->events[2], sendReq->events[3], i, j); return ncclInternalError; } sendReq->events[i]--; #ifdef NCCL_ENABLE_NET_PROFILING // Stop Qp event for sendReq - NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL)); + int qpIndex = getReqQpIndex(sendReq, j, wc->qp_num); + NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL)); #endif } } else { @@ -2398,7 +2489,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef NCCL_ENABLE_NET_PROFILING // Stop Qp event for workFifo for (int j = 0; j < req->nreqs; j++) { - NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL)); + int qpIndex = getReqQpIndex(req, j, wc->qp_num); + NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 8034d95fe..985810c47 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -45,7 +45,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallba if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; - ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); + NCCLCHECK(ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS, &ncclNetIfs)); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); pthread_mutex_unlock(&ncclNetSocketLock); @@ -124,8 +124,9 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { #define MAX_SOCKETS 64 #define MAX_THREADS 16 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS -#define MIN_CHUNKSIZE (64*1024) +NCCL_PARAM(SocketInlineSize, "SOCKET_INLINE", /*128 B=*/1 << 7); +NCCL_PARAM(SocketMinTaskSize, "SOCKET_MIN_TASKSIZE", /*64 kiB=*/1 << 16); NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); @@ -171,6 +172,7 @@ struct ncclNetSocketRequest { int op; void* data; int size; + void* inlineData; struct ncclSocket* ctrlSock; int offset; int used; @@ -211,6 +213,7 @@ struct ncclNetSocketComm { int nSocks; int nThreads; int nextSock; + void* inlineData; struct ncclNetSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct ncclNetSocketThreadResources threadResources[MAX_THREADS]; @@ -241,13 +244,13 @@ void* persistentSocketThread(void *args_) { data.sock.fd = r->sock->fd; data.sock.op = r->op; data.sock.length = r->size; - ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStart, 
resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); } #endif r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { #ifdef NCCL_ENABLE_NET_PROFILING - ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStop, NULL, 0, NULL); eHandle[i+j] = NULL; #endif WARN("NET/Socket : socket progress error"); @@ -257,7 +260,7 @@ void* persistentSocketThread(void *args_) { if (r->offset < r->size) repeat = 1; #ifdef NCCL_ENABLE_NET_PROFILING if (repeat == 0) { - ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStop, NULL, 0, NULL); eHandle[i+j] = NULL; } #endif @@ -360,6 +363,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) goto exit; } +#define SOCKET_CTRL_SIZE (sizeof(int)) ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; @@ -401,6 +405,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; } + NCCLCHECK(ncclCalloc(&comm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); *sendComm = comm; return ncclSuccess; } @@ -449,6 +454,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); free(sock); } + NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); *recvComm = rComm; /* reset lComm state */ @@ -470,6 +476,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi r->used = 1; r->comm = comm; r->nSubs = 0; + r->inlineData = (uint8_t*)comm->inlineData + i * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()); *req = r; return ncclSuccess; } @@ -520,6 +527,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro return ncclInternalError; } +// if the dataSize is smaller than the inline size, return the inline size; if not, return 0 to avoid the extra copy. +static int ncclNetSocketInlineSize(int dataSize) { return (dataSize <= ncclParamSocketInlineSize()) ? 
dataSize : 0; } + ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 0; struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request; @@ -527,37 +537,55 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { WARN("NET/Socket : test called with NULL request"); return ncclInternalError; } - if (r->used == 1) { /* try to send/recv size */ - int data = r->size; + if (r->used == 1) { /* try to send/recv size (+ inline data if any) */ + int msgSize; + uint8_t* msg = (uint8_t*)r->inlineData; + if (r->op == NCCL_SOCKET_SEND) { + // sender side has the right data size, copy size info + inline data to the buffer + int inlineSize = ncclNetSocketInlineSize(r->size); + msgSize = inlineSize + SOCKET_CTRL_SIZE; + memcpy(msg, &r->size, SOCKET_CTRL_SIZE); + if (inlineSize > 0) memcpy(msg + SOCKET_CTRL_SIZE, r->data, inlineSize); + } else { + // receiver side doesn't have the right data size, wait for the sender to send it + int sizeOffset = 0, senderSize = 0; + while (sizeOffset < SOCKET_CTRL_SIZE) { + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, SOCKET_CTRL_SIZE, &sizeOffset)); + if (sizeOffset == 0) return ncclSuccess; /* not ready yet*/ + } + memcpy(&senderSize, msg, SOCKET_CTRL_SIZE); + if (senderSize > r->size) { + char line[SOCKET_NAME_MAXLEN + 1]; + union ncclSocketAddress addr; + NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr)); + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in a healthy state, " + "there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", + ncclSocketToString(&addr, line), senderSize, r->size); + return ncclInvalidUsage; + } + // copy to the data buffer if we have received some inline data already + int receivedInline = sizeOffset - SOCKET_CTRL_SIZE; + if (receivedInline > 0) memcpy(r->data, msg + SOCKET_CTRL_SIZE, receivedInline); + // from the actual size, extract the remaining inline size to be received and redirect the msg buffer to the user data + r->size = senderSize; + msgSize = ncclNetSocketInlineSize(r->size) - receivedInline; + msg = (uint8_t*)r->data + receivedInline; + } int offset = 0; - NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); - - if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ - - // Not sure we could ever receive less than 4 bytes, but just in case ... - if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); - - // Check size is less or equal to the size provided by the user - if (r->op == NCCL_SOCKET_RECV && data > r->size) { - char line[SOCKET_NAME_MAXLEN+1]; - union ncclSocketAddress addr; - NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr)); - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ - there may be a mismatch in collective sizes or environment settings (e.g. 
NCCL_PROTO, NCCL_ALGO) between ranks", - ncclSocketToString(&addr, line), data, r->size); - return ncclInvalidUsage; + while (offset < msgSize) { + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, msgSize, &offset)); + if (offset == 0) return ncclSuccess; /* not ready yet*/ } - r->size = data; - r->offset = 0; - r->used = 2; // done exchanging size - // divide into subtasks - int chunkOffset = 0, i = 0; + // done exchanging sizes, r->size now contains the actual size + r->used = 2; + r->offset = ncclNetSocketInlineSize(r->size); + int chunkOffset = r->offset, i = 0; if (r->comm->nSocks > 0) { - // each request can be divided up to nSocks tasks - int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + // each request can be divided up to nSocks tasks, we use the size left to transfer + int taskSize = std::max((int)ncclParamSocketMinTaskSize(), DIVUP(r->size - r->offset, r->comm->nSocks)); while (chunkOffset < r->size) { - int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + int chunkSize = std::min(taskSize, r->size - chunkOffset); + NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + i++)); chunkOffset += chunkSize; } } @@ -588,7 +616,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { data.sock.fd = r->ctrlSock->fd; data.sock.op = r->op; data.sock.length = r->size; - ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + ncclProfilerFunction(&r->pInfo.eHandle, ncclProfilerNetEventStart, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); } #endif if (r->offset < r->size) { @@ -599,7 +627,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 1; r->used = 0; #ifdef NCCL_ENABLE_NET_PROFILING - ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL); + ncclProfilerFunction(&r->pInfo.eHandle, ncclProfilerNetEventStop, NULL, 0, NULL); r->pInfo.eHandle = NULL; #endif } @@ -673,6 +701,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } + if(comm->inlineData) free(comm->inlineData); free(comm); } return ncclSuccess; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index d99f7cb3e..da8d263f1 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -13,6 +13,7 @@ #include "enqueue.h" #include "register.h" #include "transport.h" +#include "register_inline.h" #if CUDART_VERSION >= 12010 @@ -109,7 +110,9 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll } ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { - CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); + // unbind can trigger RM error if buffer is freed already by users + // however, it is safe to ignore the error, and unbind will succeed anyway + CUCALL(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); CUCHECK(cuMemUnmap(ptr, mcsize)); CUCHECK(cuMemAddressFree(ptr, mcsize)); CUCHECK(cuMemRelease(*mcHandler)); @@ -143,9 +146,9 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr #define NVLS_MEM_ALIGN_SIZE (1 << 21) #define NVLS_NCHANNELS_SM90 16 #define NVLS_NCHANNELS_SM100 32 +#define NVLS_NCHANNELS_SM100_NVL 
24 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -171,12 +174,31 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsSupport = 1; } - INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); if (comm->nvlsSupport) { - int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; - if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + int channels; + if (comm->compCap >= 100) { + // Use a reduced number of channels for single node/MNNVL domain on Blackwell. + // comm->nNodes is not yet initialized at this point so we need to use other data. + bool multiNode; + if (comm->MNNVL) { + multiNode = (comm->clique.size < comm->nRanks); + } else { + int i; + for (i = 1; i < comm->nRanks; i++) { + if (comm->peerInfo[i].hostHash != comm->peerInfo[0].hostHash) + break; + } + multiNode = (i < comm->nRanks); + } + channels = (multiNode ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM100_NVL); + } else { + channels = NVLS_NCHANNELS_SM90; + } + if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT) channels = comm->config.nvlsCTAs; comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); } + INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d (NVLS_NCHANNELS %d)", + comm->nvlsSupport ? "" : "not ", dev, comm->nvlsChannels); return ncclSuccess; } @@ -242,16 +264,33 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), ret, fail); // Alloc local physical mem for this NVLS group - CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail); - CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail); + CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail1); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail2); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail3); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail3); // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail3); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group - CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail); + // This is where we normally see issues if the system NVLS/Multicast support is broken + { + CUresult err = CUPFN(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/)); + if (err != CUDA_SUCCESS) { + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + if (ncclParamNvlsEnable() == 1) { + // Fail the job as NVLS support is not available + WARN("Failed to bind NVLink SHARP (NVLS) Multicast memory of size %ld : CUDA error %d '%s'.\nThis is usually caused by a system or configuration error in 
the Fabric Manager or NVSwitches.\nDo not force-enable NVLS (NCCL_NVLS_ENABLE=1) if you wish to avoid this error in the future.", ucsize, err, errStr ); + ret = ncclUnhandledCudaError; + } else { + // Continue without NVLS support (returns ncclSuccess) + INFO(NCCL_INIT|NCCL_NVLS, "Failed to bind NVLink SHARP (NVLS) Multicast memory of size %ld : CUDA error %d '%s'. Proceeding without NVLS support.", ucsize, err, errStr); + } + comm->nvlsSupport = comm->nvlsChannels = 0; + goto fail3; + } + } // Map mc virtual address CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail); @@ -263,6 +302,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc exit: return ret; +fail3: + CUCHECK(cuMemUnmap((CUdeviceptr)*ucptr, ucsize)); +fail2: + CUCHECK(cuMemRelease(*ucHandle)); +fail1: + CUCHECK(cuMemAddressFree((CUdeviceptr)*ucptr, ucsize)); fail: if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle)); goto exit; @@ -291,8 +336,8 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { nvlsPerRankSize = nChannels * 2 * buffSize; nvlsTotalSize = nvlsPerRankSize * nHeads; - INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", - comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); + INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d nvlsRanks %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", + comm, headRank, nHeads, comm->localRanks, buffSize, nvlsPerRankSize, nvlsTotalSize); NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail); @@ -338,32 +383,10 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { size_t typeSize; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; uintptr_t *nvlsShmem = NULL; - bool nvlsShare = parent && parent->nvlsSupport && parent->config.splitShare; - int nHeads = comm->channels[0].nvls.nHeads; + bool nvlsShare = parent && parent->nvlsSupport && parent->shareResources && parent->localRanks == comm->localRanks; if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; - if (nvlsShare && parent->channels[0].nvls.nHeads == nHeads) { - for (int ch = 0; ch < nHeads; ++ch) { - bool find = false; - for (int h = 0; h < parent->channels[0].nvls.nHeads; ++h) { - if (comm->nvlsHeads[ch] == parent->nvlsHeads[h]) { - // find the head - find = true; - break; - } - } - if (find == false) { - nvlsShare = false; - goto setup; - } - } - nvlsShare = true; - } else { - nvlsShare = false; - } - -setup: comm->nvlsChunkSize = ncclParamNvlsChunkSize(); if (nvlsShare) { /* reuse NVLS resources */ @@ -387,9 +410,10 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { comm->nvlsResources->inited = false; comm->nvlsResources->refCount = 1; comm->nvlsResources->nChannels = comm->nvlsChannels; + comm->nvlsResources->nHeads = nHeads; resources = comm->nvlsResources; - if (parent && parent->nvlsSupport && parent->config.splitShare) { + if (parent && parent->nvlsSupport && parent->shareResources) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. 
*/ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); @@ -529,9 +553,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t if (userBuff) { NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, ®Record), ret, fail); if (regRecord) { - CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail); + CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->begAddr), ret, fail); if (attr.type == cudaMemoryTypeDevice) { - size_t regSize = regRecord->pages * comm->regCache.pageSize; + size_t regSize = regRecord->endAddr - regRecord->begAddr; memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; @@ -546,8 +570,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr), ret, fail); - if (regRecord->addr % ucgran == 0) { + if (regRecord->begAddr % ucgran == 0) { if (regSize % ucgran != 0) { regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran); } else { @@ -555,7 +578,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); - regData[comm->localRank].offset = userBuff - regRecord->addr; + regData[comm->localRank].offset = userBuff - regRecord->begAddr; } } @@ -595,7 +618,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out. // coverity[var_deref_op] - CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->begAddr, ucsize, 0), ret, fail); // Create a VA for the NVLS CUCHECKGOTO(cuMemAddressReserve(®Ptr, mcsize, mcgran, 0U, 0), ret, fail); @@ -610,7 +633,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; /* get all buffer addresses */ - regRecord->caddrs[comm->localRank] = regRecord->addr; + regRecord->caddrs[comm->localRank] = regRecord->begAddr; NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail); /* Although registration is done, we still need to check whether the offsets are same among ranks. 
*/ @@ -642,23 +665,23 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu if (sendRegRecord) { memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; + regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->begAddr; } if (recvRegRecord) { memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; + regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->begAddr; } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail); /* first check whether all local ranks find their registered buffer */ for (int i = 0; i < comm->localRanks; ++i) { - if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) { + if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.begAddr) { sendNeedReg = true; } - if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) { + if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.begAddr) { recvNeedReg = true; } @@ -787,7 +810,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); } - NCCLCHECK(nvlsRegisterBuffer(comm, baseSend, baseRecv, baseSendSize, baseRecvSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); if (*outRegBufUsed) { if (sendRegRecord) { @@ -815,6 +838,124 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( return ncclSuccess; } +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport) { + CUmulticastObjectProp mcprop = {}; + CUmemGenericAllocationHandle mcHandle; + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmemAccessDesc accessDesc = {}; + + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = comm->baseStride; + + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail); + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&comm->baseMCSymPtr, comm->baseStride, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, 0, mcHandle, 0), ret, fail); + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + 
accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, &accessDesc, 1), ret, fail); + comm->symMCHandle = mcHandle; + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && comm->baseMCSymPtr) { + CUCHECKGOTO(cuMemUnmap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); + CUCHECKGOTO(cuMemAddressFree((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); + CUCHECKGOTO(cuMemRelease(comm->symMCHandle), ret, fail); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { + ncclResult_t ret = ncclSuccess; + assert((uintptr_t)ucaddr % NCCL_REC_PAGE_SIZE == 0 && ucsize % NCCL_REC_PAGE_SIZE == 0); + if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { + CUCHECKGOTO(cuMulticastBindAddr(comm->symMCHandle, offset, (CUdeviceptr)ucaddr, ucsize, 0), ret, fail); + INFO(NCCL_ALLOC, "NVLS symmetric alloc mc buffer ptr %p offset %ld UC addr %p UC size %ld symAllocHead %ld", comm->baseMCSymPtr + offset, offset, ucaddr, ucsize, comm->symAllocHead); + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { + size_t offset = (size_t)ucaddr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); + CUCHECKGOTO(cuMulticastUnbind(comm->symMCHandle, comm->cudaDev, offset, ucsize), ret, fail); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { + int factor; + ncclResult_t ret = ncclSuccess; + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) { + factor = (comm->compCap >= 100 ? 6 : 5) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllGather) { + factor = 4 * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllReduce) { + if (comm->compCap >= 100) { + factor = 8 * 8; + } else { + factor = 4 * 8; + } + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else { + goto fail; + } + } else { + // Further tweaks for Blackwell with NVLS registered buffers + if (info->func == ncclFuncReduceScatter) { + factor = (comm->bandwidths[ncclFuncReduceScatter][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] > 400 ? 7 : 6) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllGather) { + factor = 6 * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllReduce) { + factor = (comm->compCap >= 100 ? 
7 : 6) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else { + goto fail; + } + } + +exit: + return ret; +fail: + ret = ncclInvalidArgument; + goto exit; +} + #else /* @@ -860,4 +1001,25 @@ ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHa return ncclSuccess; } +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { + *recChannels = 0; + return ncclSuccess; +} + #endif /* CUDA_VERSION >= 12010 */ diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index aed84c588..d263dda3a 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -12,6 +12,7 @@ #include "transport.h" #include #include "shm.h" +#include "register_inline.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; @@ -826,7 +827,7 @@ ncclResult_t ret = ncclSuccess; // We already have IPC info for peerLocalRank, no need to register it, we can reuse it *regBufFlag = 1; if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap; - INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); + INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, regRecord->endAddr - regRecord->begAddr, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); } else { // Register buffer with peerLocalRank struct ncclProxyConnector* proxyConn = NULL; @@ -885,11 +886,11 @@ ncclResult_t ret = ncclSuccess; void* rmtRegAddr = NULL; ipcInfo.size = baseSize; - ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; + ipcInfo.offset = regRecord->begAddr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. 
if (proxyConn) { - INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank); + INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, ipcInfo.size, peerRank); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); } if (rmtRegAddr) { @@ -909,7 +910,7 @@ ncclResult_t ret = ncclSuccess; regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; needUpdate = true; *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->begAddr); } } } @@ -935,7 +936,7 @@ ncclResult_t ret = ncclSuccess; // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; } - *offsetOut = (uintptr_t)userbuff - regRecord->addr; + *offsetOut = (uintptr_t)userbuff - regRecord->begAddr; *peerRmtAddrsOut = peerRmtAddrs; } } @@ -1117,6 +1118,88 @@ static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, s goto exit; } +ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm) { + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&comm->baseUCSymPtr, comm->baseStride * comm->localRanks, NCCL_MAX_PAGE_SIZE, 0, 0)); + return ncclSuccess; +} + +ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm) { + if (comm->baseUCSymPtr) { + CUCHECK(cuMemAddressFree((CUdeviceptr)comm->baseUCSymPtr, comm->baseStride * comm->localRanks)); + } + return ncclSuccess; +} + +ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr) { + ncclResult_t ret = ncclSuccess; + CUmemGenericAllocationHandle impHandle; + int impFd = -1; + ncclCuDesc* desc = NULL; + CUmemAccessDesc accessDesc = {}; + + assert(offset % NCCL_REC_PAGE_SIZE == 0 && size % NCCL_REC_PAGE_SIZE == 0); + NCCLCHECKGOTO(ncclCalloc(&desc, comm->localRanks), ret, fail); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + memcpy(&desc[comm->localRank].data, &memHandle, sizeof(CUmemGenericAllocationHandle)); + } else { + CUCHECKGOTO(cuMemExportToShareableHandle(&desc[comm->localRank].handle, memHandle, ncclCuMemHandleType, 0), ret, fail); + } + + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, desc, sizeof(ncclCuDesc)), ret, fail); + + // start mapping + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + for (int r = 0; r < comm->localRanks; ++r) { + CUdeviceptr maddr; + if (r == comm->localRank) { + impHandle = memHandle; + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + impFd = -1; + NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, comm->localRankToRank[r], 
&desc[r].data, &impFd), ret, fail); + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)(uintptr_t)impFd, ncclCuMemHandleType), ret, fail); + SYSCHECKGOTO(close(impFd), "close", ret, fail); + } else { + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&desc[r].handle, ncclCuMemHandleType), ret, fail); + } + } + maddr = (CUdeviceptr)(comm->baseUCSymPtr + (size_t)r * comm->baseStride + offset); + CUCHECKGOTO(cuMemMap(maddr, size, 0, impHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(maddr, size, &accessDesc, 1), ret, fail); + + if (r == comm->localRank) { + *symPtr = (void*)maddr; + } else { + CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); + } + } + + INFO(NCCL_ALLOC, "IPC symmetric alloc buffer %p offset %ld size %ld symAllocHead %ld", *symPtr, offset, size, comm->symAllocHead); + +exit: + free(desc); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr) { + ncclResult_t ret = ncclSuccess; + if (comm && symPtr && size > 0) { + size_t offset = (size_t)symPtr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); + for (int r = 0; r < comm->localRanks; ++r) { + CUdeviceptr peerAddr = (CUdeviceptr)(comm->baseUCSymPtr + r * comm->baseStride + offset); + CUCHECKGOTO(cuMemUnmap(peerAddr, size), ret, fail); + } + } +exit: + return ret; +fail: + goto exit; +} + struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc index 3e32843aa..6e7b33c16 100644 --- a/src/transport/profiler.cc +++ b/src/transport/profiler.cc @@ -6,6 +6,7 @@ #include "transport.h" #include "proxy.h" #include "profiler.h" +#include "device.h" static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { connection->proxyAppendPtr = &connection->proxyAppend; @@ -29,15 +30,15 @@ static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, str if (args->state == ncclProxyOpProgress) { for (int s = 0; s < args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs + s; - uint64_t* workStarted = (uint64_t *)sub->sendbuff; - uint64_t* workCompleted = (uint64_t *)sub->recvbuff; - if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) { - ncclProfilerStartKernelChEvent(args, s); + struct ncclDevProfiler* workStarted = (struct ncclDevProfiler *)sub->sendbuff; + struct ncclDevProfiler* workCompleted = (struct ncclDevProfiler *)sub->recvbuff; + if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { + ncclProfilerStartKernelChEvent(args, s, workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); sub->posted = sub->nsteps; continue; // allow events on every channel to start } - if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) { - ncclProfilerStopKernelChEvent(args, s); + if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { + ncclProfilerStopKernelChEvent(args, s, workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); sub->transmitted = sub->nsteps; args->done++; } diff --git a/src/transport/shm.cc b/src/transport/shm.cc index aa3e6c41b..993570da2 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -10,7 +10,7 @@ #include 
"transport.h" #define SHM_PATH_MAX 128 -#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +#define SHM_HANDLE_TYPE ncclCuMemHandleType struct shmBuffInfo { void *hptr; From 3ea7eedf3b9b94f1d9f99f4e55536dfcbd23c1ca Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Wed, 18 Jun 2025 10:34:47 -0700 Subject: [PATCH 12/21] NCCL 2.27.5-1 Improvements for GB200 systems * Optimize the network performance by alternating the direction of the rings and the NIC to GPU assignment across communicators to limit unnecessary sharing. * Fix the detection of C2C links in case GPU Direct RDMA is disabled between a GPU and a NIC. * Fix PXN support on MNNVL systems, where NCCL would try (and fail) to share regular host memory across multiple nodes. * Fix P2C (PXN over C2C), which is now preferred over regular PXN. This support is currently preliminary and is disabled by default; use NCCL_PXN_C2C=1 to enable. Further reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Optimize the network performance on DGX B200 systems by adjusting the bandwidths provided to the graph search algorithm. Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8. Restore the plugin name handling logic to make it possible to specify a path to the plugin (Issue #1732). Restore the ability to change NCCL_COLLNET_ENABLE during execution (Issue #1741). Add an example tuner plugin with CSV-based overrides. Remove an x86 dependency from the example profiler. --- ext-net/example/Makefile | 21 +- ext-profiler/example/Makefile | 20 +- ext-profiler/example/plugin.c | 22 +- ext-tuner/basic/Makefile | 23 + ext-tuner/basic/nccl/common.h | 15 + ext-tuner/basic/nccl/err.h | 17 + ext-tuner/basic/nccl/tuner.h | 97 ++ ext-tuner/basic/plugin.c | 34 + ext-tuner/example/Makefile | 54 +- ext-tuner/example/README.md | 164 ++++ ext-tuner/example/nccl_tuner.conf | 45 + ext-tuner/example/plugin.c | 433 ++++++++- ext-tuner/example/scripts/README.md | 106 +++ ext-tuner/example/scripts/optimize_config.py | 430 +++++++++ .../scripts/sample_performance_data.csv | 24 + ext-tuner/example/test/Makefile | 30 + ext-tuner/example/test/README.md | 205 +++++ ext-tuner/example/test/test_plugin.c | 856 ++++++++++++++++++ makefiles/common.mk | 10 +- makefiles/version.mk | 2 +- src/device/Makefile | 5 +- src/device/reduce_kernel.h | 2 +- src/device/symmetric/generate.py | 4 +- src/graph/paths.cc | 30 +- src/graph/search.cc | 120 ++- src/graph/topo.cc | 15 +- src/graph/topo.h | 11 +- src/graph/tuning.cc | 11 +- src/init.cc | 14 +- src/misc/mlx5dvsymbols.cc | 3 + src/misc/strongstream.cc | 28 +- src/plugin/plugin_open.cc | 26 +- src/transport/net_ib.cc | 6 +- 33 files changed, 2740 insertions(+), 143 deletions(-) create mode 100644 ext-tuner/basic/Makefile create mode 100644 ext-tuner/basic/nccl/common.h create mode 100644 ext-tuner/basic/nccl/err.h create mode 100644 ext-tuner/basic/nccl/tuner.h create mode 100644 ext-tuner/basic/plugin.c create mode 100644 ext-tuner/example/README.md create mode 100644 ext-tuner/example/nccl_tuner.conf create mode 100644 ext-tuner/example/scripts/README.md create mode 100644 ext-tuner/example/scripts/optimize_config.py create mode 100644 ext-tuner/example/scripts/sample_performance_data.csv create mode 100644 ext-tuner/example/test/Makefile create mode 100644 ext-tuner/example/test/README.md create mode 100644 ext-tuner/example/test/test_plugin.c diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile index e0a6aa619..9cc623e31 100644 --- 
a/ext-net/example/Makefile +++ b/ext-net/example/Makefile @@ -3,15 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-net.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-net-example.so + +${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-net-example.so diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index f5cc9f1d8..777ff5bad 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -3,14 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME := ../../build -INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO := libnccl-profiler.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c event.c print_event.c - $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-profiler-example.so + +${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index e3f707a0a..b89cd4627 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "event.h" #include "print_event.h" @@ -41,22 +41,10 @@ static struct proxyOp* detachPool; ncclDebugLogger_t logFn; #define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -static double freq = -1; -__hidden void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1e6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i = 0; i < 10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1e6 + tv.tv_usec; - freq = timeCycles / time; -} - __hidden double gettime(void) { - return __rdtsc() / freq; + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec*1e6 + (t.tv_nsec*1e-3)); } static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // process address space. pid = getpid(); - // calibrate and start timer - calibrate(); startTime = gettime(); } pthread_mutex_unlock(&lock); diff --git a/ext-tuner/basic/Makefile b/ext-tuner/basic/Makefile new file mode 100644 index 000000000..50edd23a7 --- /dev/null +++ b/ext-tuner/basic/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . 
+NCCLDIR := $(BUILDDIR) + +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +build: ${BUILDDIR}/libnccl-tuner-basic.so + +${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ + +clean: + rm -f ${BUILDDIR}/libnccl-tuner-basic.so diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-tuner/basic/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/basic/nccl/err.h b/ext-tuner/basic/nccl/err.h new file mode 100644 index 000000000..bb92e8354 --- /dev/null +++ b/ext-tuner/basic/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/basic/nccl/tuner.h b/ext-tuner/basic/nccl/tuner.h new file mode 100644 index 000000000..77b543d12 --- /dev/null +++ b/ext-tuner/basic/nccl/tuner.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include +#include + +#include "common.h" +#include "err.h" + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. 
+ // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/ext-tuner/basic/plugin.c b/ext-tuner/basic/plugin.c new file mode 100644 index 000000000..a17fd009e --- /dev/null +++ b/ext-tuner/basic/plugin.c @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "tuner.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } + +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} + +__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + +#define PLUGIN_NAME "Basic" + +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .getCollInfo = pluginGetCollInfo, + .destroy = pluginDestroy +}; diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile index 9d9ace484..76c16b60f 100644 --- a/ext-tuner/example/Makefile +++ b/ext-tuner/example/Makefile @@ -3,15 +3,53 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-tuner.so -default: $(PLUGIN_SO) +.DEFAULT_GOAL: build +PLUGIN_SO:=libnccl-tuner-example.so +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +default: ${BUILDDIR}/$(PLUGIN_SO) + +build: ${BUILDDIR}/$(PLUGIN_SO) + +${BUILDDIR}/$(PLUGIN_SO): plugin.c + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Test targets - delegate to test directory +test: + $(MAKE) -C test test TEST_CASE=$(TEST_CASE) + +test-verbose: + $(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE) + +# Build tests +test-build: + $(MAKE) -C test all + +# Optimize configurations from performance data +optimize-config: + @if [ -z "$(CSV_FILE)" ]; then \ + echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \ + echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \ + exit 1; \ + fi + python3 scripts/optimize_config.py $(CSV_FILE) \ + $(if $(OUTPUT),-o $(OUTPUT)) \ + $(if $(METRIC),-m $(METRIC)) \ + $(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \ + $(if $(DRY_RUN),--dry-run) \ + $(if $(NO_HEADER),--no-header) clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/$(PLUGIN_SO) + $(MAKE) -C test clean + +.PHONY: test test-verbose test-build optimize-config clean diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md new file mode 100644 index 000000000..7f472ae7a --- /dev/null +++ b/ext-tuner/example/README.md @@ -0,0 +1,164 @@ +# NCCL Example Tuner Plugin + +This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling. + +## Features + +- **File-based Configuration**: Read tuning parameters from a CSV configuration file +- **Size-based Tuning**: Specify different configurations based on message size ranges +- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks +- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default +- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE` +- **Fallback Behavior**: Gracefully handles missing config files and invalid entries + +## Building + +```bash +make +``` + +This will create `libnccl-tuner-example.so` that can be loaded by NCCL. 
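As a quick check that the plugin builds and loads, a minimal run might look like the sketch below. It assumes the library was just built in the current directory and that `your_nccl_application` stands in for any NCCL program; with `NCCL_DEBUG=INFO` the log should mention the tuner plugin once it is picked up (see the Usage and Logging sections further down for the settings this relies on).

```bash
# Minimal sketch -- paths and the application name are placeholders.
# Make the freshly built plugin visible to the dynamic loader and
# enable INFO logging so the tuner load message shows up.
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH
export NCCL_DEBUG=INFO
mpirun -np 4 ./your_nccl_application 2>&1 | grep -i tuner
```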
+ +## Configuration File Format + +The configuration file uses CSV (Comma-Separated Values) format with one configuration per line: + +``` +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +``` + +### Parameters + +- **collective_type**: The collective operation type + - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce` + +- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies + - Use `0` for minimum and `4294967295` for maximum (covers all sizes) + +- **algorithm**: The NCCL algorithm to use + - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat` + +- **protocol**: The NCCL protocol to use + - `ll`, `ll128`, `simple` + +- **channels**: Number of channels (SMs) to use + - Use a positive integer to specify exact channel count + - Use `-1` to keep NCCL's default channel selection + +- **nNodes**: Number of nodes to match + - Use a positive integer to match specific node count + - Use `-1` to match any number of nodes + +- **nRanks**: Number of ranks to match + - Use a positive integer to match specific rank count + - Use `-1` to match any number of ranks + +- **numPipeOps**: Number of pipeline operations to match (optional) + - Use a positive integer to match specific pipeline operation count + - Use `-1` to match any number of pipeline operations + - If omitted, configuration will match any numPipeOps value + +- **regBuff**: Whether user buffer can be registered (optional) + - Use `0` to match only non-registered buffers + - Use `1` to match only registered buffers + - Use `-1` to match either registered or non-registered buffers + - If omitted, configuration will match any regBuff value + +### Example Configuration + +```csv +# Single-node, small allreduce: use tree algorithm, registered buffers only +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible) +broadcast,0,32768,tree,simple,-1,1,-1 + +# Multi-node broadcast: optimized for non-registered buffers, single pipeline op +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 +``` + +Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc. + +### Backward Compatibility + +Configurations without the numPipeOps and/or regBuff parameters are fully supported: +- 8 fields: matches any numPipeOps and regBuff values +- 9 fields: matches any regBuff value +- 10 fields: full parameter specification + +This ensures existing configuration files continue to work without modification. + +## Usage + +### Method 1: Default Config File +Place your configuration in `nccl_tuner.conf` in the current working directory. + +### Method 2: Environment Variable +Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf +export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +## Editing Configuration Files + +### Generating Configuration Files from Raw Data + +A python script to generate valid CSV configs has been provided. 
[Using optimize_config.py](scripts/README.md). + +### Spreadsheet Tips: +- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff` +- Save as CSV format (not Excel format) for the plugin to read +- Use data validation to prevent typos in algorithm/protocol names + +## Logging + +The plugin uses NCCL's logging system. To see tuner-related messages: + +```bash +export NCCL_DEBUG=INFO +``` + +This will show when configurations are loaded and applied, including the topology information. + +For detailed debugging output during tuning decisions: + +```bash +export NCCL_DEBUG=TRACE +``` + +This will show verbose information about which configurations are being evaluated and matched. + +## Dimension Matching + +Configurations are only applied when the topology matches: + +- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks +- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks +- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node +- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology + +This allows you to create specialized configurations for different cluster setups while maintaining flexibility. + +## Default Behavior + +If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic. + +When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size. + +## Troubleshooting + +1. **Config file not found**: Check the file path and permissions +2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters +3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` +5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards +6. 
**CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/ext-tuner/example/nccl_tuner.conf b/ext-tuner/example/nccl_tuner.conf new file mode 100644 index 000000000..13eb2f081 --- /dev/null +++ b/ext-tuner/example/nccl_tuner.conf @@ -0,0 +1,45 @@ +# NCCL Tuner Configuration File (CSV Format) +# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +# +# Collective types: broadcast, reduce, allgather, reducescatter, allreduce +# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat +# Protocols: ll, ll128, simple +# Channels: number of channels to use, or -1 to keep default +# nNodes: number of nodes to match, or -1 for any number of nodes +# nRanks: number of ranks to match, or -1 for any number of ranks +# numPipeOps: number of pipeline operations to match, or -1 for any number (optional) +# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional) +# +# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value +# +# Examples: + +# For single-node configurations with registered buffers +# Small allreduce operations on single node - use tree algorithm, registered buffers +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers +# Medium allreduce operations - use ring algorithm +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types +# Single node broadcast - prefer tree, any pipeOps, registered buffers only +broadcast,0,32768,tree,simple,-1,1,-1,-1,1 + +# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 + +# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type +allgather,0,4294967295,ring,simple,4,2,-1 + +# ReduceScatter operations +# Small messages on single node, single pipeline op, registered buffers +reducescatter,0,131072,tree,simple,2,1,-1,1,1 +# Large messages on any topology, multiple pipeline ops, non-registered buffers +reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0 + +# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type +reduce,0,4294967295,tree,simple,-1,-1,-1 diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 7925dcfa1..1b8031ed1 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -5,24 +5,443 @@ ************************************************************************/ #include "tuner.h" +#include +#include +#include #define __hidden __attribute__ ((visibility("hidden"))) +#define MAX_LINE_LENGTH 256 -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } +// CSV field indices for configuration parsing +// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +#define CONFIG_FIELD_COLLTYPE 0 +#define CONFIG_FIELD_MINBYTES 1 +#define CONFIG_FIELD_MAXBYTES 2 +#define CONFIG_FIELD_ALGORITHM 3 +#define CONFIG_FIELD_PROTOCOL 4 +#define CONFIG_FIELD_CHANNELS 5 +#define CONFIG_FIELD_NNODES 6 +#define 
CONFIG_FIELD_NRANKS 7 +#define CONFIG_FIELD_PIPEOPS 8 // Optional field +#define CONFIG_FIELD_REGBUFF 9 // Optional field + +// Field count constants +#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks) +#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps +#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff +#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported + +typedef struct { + ncclFunc_t collType; + size_t minBytes; + size_t maxBytes; + int algorithm; + int protocol; + int nChannels; + int nNodes; + int nRanks; + int numPipeOps; + int regBuff; +} TuningConfig; + +typedef struct { + TuningConfig* configs; // Changed from static array to dynamic pointer + int numConfigs; + int maxConfigs; // Added to track allocated size + size_t nRanks; + size_t nNodes; + ncclDebugLogger_t logFunction; +} TunerContext; + +// Parse collective type from string +static ncclFunc_t parseCollType(const char* str) { + if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "allgather") == 0) return ncclFuncAllGather; + if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce; + return ncclFuncAllReduce; // default +} + +// Convert collective type to string +static const char* collTypeToString(ncclFunc_t collType) { + switch (collType) { + case ncclFuncBroadcast: return "broadcast"; + case ncclFuncReduce: return "reduce"; + case ncclFuncAllGather: return "allgather"; + case ncclFuncReduceScatter: return "reducescatter"; + case ncclFuncAllReduce: return "allreduce"; + default: return "unknown"; + } +} + +// Parse algorithm from string +static int parseAlgorithm(const char* str) { + if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE; + if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING; + if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT; + if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN; + if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS; + if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE; + if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT; + return NCCL_ALGO_RING; // default +} + +// Convert algorithm to string +static const char* algorithmToString(int algorithm) { + switch (algorithm) { + case NCCL_ALGO_TREE: return "tree"; + case NCCL_ALGO_RING: return "ring"; + case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct"; + case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain"; + case NCCL_ALGO_NVLS: return "nvls"; + case NCCL_ALGO_NVLS_TREE: return "nvls_tree"; + case NCCL_ALGO_PAT: return "pat"; + default: return "unknown"; + } +} + +// Parse protocol from string +static int parseProtocol(const char* str) { + if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL; + if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128; + if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE; + return NCCL_PROTO_SIMPLE; // default +} + +// Convert protocol to string +static const char* protocolToString(int protocol) { + switch (protocol) { + case NCCL_PROTO_LL: return "ll"; + case NCCL_PROTO_LL128: return "ll128"; + case NCCL_PROTO_SIMPLE: return "simple"; + default: return "unknown"; + } +} + +// Helper function to count valid configuration lines in file +static int countConfigLines(const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + return 0; + } + + char line[MAX_LINE_LENGTH]; + int count = 0; + + while 
(fgets(line, sizeof(line), file)) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Check if line has content + if (strlen(line) > 0) { + count++; + } + } + + fclose(file); + return count; +} + +// Load configuration from file +static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename); + } + return ncclSuccess; // Not finding config file is not an error + } + + // First pass: count valid configuration lines + int configCount = countConfigLines(filename); + if (configCount == 0) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No valid configurations found in %s", filename); + } + fclose(file); + return ncclSuccess; + } + + // Allocate memory for configurations based on actual count + ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig)); + if (!ctx->configs) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount); + } + fclose(file); + return ncclSystemError; + } + + ctx->maxConfigs = configCount; + ctx->numConfigs = 0; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount); + } + + // Reset file pointer to beginning + fseek(file, 0, SEEK_SET); + + char line[MAX_LINE_LENGTH]; + + while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + char* token; + char* tokens[CONFIG_FIELDS_MAX]; + int tokenCount = 0; + + // Make a copy of the line for tokenizing + char lineCopy[MAX_LINE_LENGTH]; + strncpy(lineCopy, line, sizeof(lineCopy)); + lineCopy[sizeof(lineCopy) - 1] = '\0'; + + // Tokenize by comma + token = strtok(lineCopy, ","); + while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) { + // Trim whitespace + while (*token == ' ' || *token == '\t') token++; + char* end = token + strlen(token) - 1; + while (end > token && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } + tokens[tokenCount++] = token; + token = strtok(NULL, ","); + } + + // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10) + if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) { + TuningConfig* config = &ctx->configs[ctx->numConfigs]; + config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]); + config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10); + config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10); + config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]); + config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]); + config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]); + config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]); + config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]); + + // numPipeOps is optional (9th field, index 8) 
+ if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) { + config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]); + } else { + config->numPipeOps = -1; // -1 means match any numPipeOps + } + + // regBuff is optional (10th field, index 9) + if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) { + config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]); + } else { + config->regBuff = -1; // -1 means match any regBuff value + } + + ctx->numConfigs++; + + if (ctx->logFunction) { + if (config->numPipeOps == -1 && config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks); + } else if (config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps); + } else if (config->numPipeOps == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->regBuff); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + } + } + } + + fclose(file); + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename); + } + return ncclSuccess; +} + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); + if (!ctx) return ncclSystemError; + + ctx->configs = NULL; // Initialize to NULL + ctx->numConfigs = 0; + ctx->maxConfigs = 0; // Initialize to 0 + ctx->nRanks = nRanks; + ctx->nNodes = nNodes; + ctx->logFunction = logFunction; + + if (logFunction) { + logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + } + + // Try to load config file from environment variable or default location + const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE"); + if (!configFile) { + configFile = "nccl_tuner.conf"; // default config file name + } + + ncclResult_t result = loadConfig(ctx, configFile); + if (result != ncclSuccess) { + if (ctx->configs) { + free(ctx->configs); // Clean up allocated memory on error + } + free(ctx); + return result; + } + + *context = ctx; + return ncclSuccess; +} __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, 
size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff, int* nChannels) { - // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; - } + TunerContext* ctx = (TunerContext*)context; + if (!ctx) return ncclInternalError; + + // Default channels *nChannels = 1; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d", + collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs); + } + + // Look for matching configuration + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d", + i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + + // Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff + if (config->collType == collType && + nBytes >= config->minBytes && + nBytes <= config->maxBytes && + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) && + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) && + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps) && + (config->regBuff == -1 || config->regBuff == regBuff)) { + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config matches. 
Applying algo=%s, proto=%s, channels=%d", + algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels); + } + + // Check bounds + if (config->algorithm < numAlgo && config->protocol < numProto) { + if (collCostTable[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)", + algorithmToString(config->algorithm), protocolToString(config->protocol), + &collCostTable[config->algorithm][config->protocol], collCostTable[config->algorithm][config->protocol]); + } + collCostTable[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration + + // Only override channels if not set to -1 (keep default) + if (config->nChannels != -1) { + *nChannels = config->nChannels; + } + + if (ctx->logFunction) { + if (config->nChannels == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nChannels, config->nNodes, config->nRanks); + } + } + return ncclSuccess; + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE", + algorithmToString(config->algorithm), protocolToString(config->protocol)); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)", + algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d", + config->collType == collType, + (nBytes >= config->minBytes && nBytes <= config->maxBytes), + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes), + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks), + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps), + (config->regBuff == -1 || config->regBuff == regBuff)); + } + } + } + + // If no specific config found, apply default behavior + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No matching config found"); + } + return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } +__hidden ncclResult_t pluginDestroy(void* context) { + if (context) { + TunerContext* ctx = (TunerContext*)context; + if (ctx->configs) { + free(ctx->configs); // Free dynamically allocated configs array + } + free(context); + } + return 
ncclSuccess; +} #define PLUGIN_NAME "Example" diff --git a/ext-tuner/example/scripts/README.md b/ext-tuner/example/scripts/README.md new file mode 100644 index 000000000..d31de4354 --- /dev/null +++ b/ext-tuner/example/scripts/README.md @@ -0,0 +1,106 @@ +# NCCL Tuner Configuration Scripts + +This directory contains scripts for optimizing NCCL tuner configurations based on performance data. + +## optimize_config.py + +A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations. + +### Usage + +```bash +python scripts/optimize_config.py [options] +``` + +### Options + +- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`) +- `-m, --metric METRIC`: Optimization metric (`cost_metric`, `bandwidth_gbps`, `latency_us`) +- `--no-header`: Don't add header comments to output file +- `--dry-run`: Print configurations without writing to file + +### CSV Input Format + +The input CSV file should have the following columns: + +```csv +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +``` + +**Required columns:** +- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.) +- `size_bytes`: Message size in bytes +- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.) +- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`) +- `channels`: Number of channels (or `-1` for default) +- `nodes`: Number of nodes (or `-1` for any) +- `ranks`: Number of ranks (or `-1` for any) +- `pipeOps`: Number of pipeline operations (or `-1` for any) +- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any) + +**Optional metrics (must have at least one present):** +- `bandwidth_gbps`: Bandwidth in GB/s (higher is better) +- `latency_us`: Latency in microseconds (lower is better) + +### Examples + +**Basic usage with cost optimization:** +```bash +python scripts/optimize_config.py sample_performance_data.csv +``` + +**Optimize for bandwidth and write to custom file:** +```bash +python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv +``` + +**Preview configurations without writing:** +```bash +python scripts/optimize_config.py --dry-run performance_data.csv +``` + +### How It Works + +1. **Data Loading**: Reads CSV performance data and validates format +2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters +3. **Size Ranges**: Automatically bins data into size ranges for optimization +4. **Optimization**: Finds the best performing configuration for each group/size combination +5. **Output**: Generates NCCL tuner config format and appends to specified file + +### Default Size Ranges + +The script uses these default size ranges (in bytes): +- Small: 0 - 1,024 +- Medium: 1,025 - 65,536 +- Large: 65,537 - 1,048,576 +- XLarge: 1,048,577 - 16,777,216 +- XXLarge: 16,777,217 - 4,294,967,295 + +### Sample Data + +See `sample_performance_data.csv` for an example of the expected input format. + +### Integration with NCCL + +The generated configuration file can be used directly with the NCCL tuner plugin: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf +export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner.so +mpirun -np 8 your_nccl_application +``` + +### Performance Data Collection + +To collect performance data for optimization, you can: + +1. **Use NCCL benchmarks** with different algorithm/protocol combinations +2. 
**Profile your applications** with various tuner settings +3. **Run systematic sweeps** across parameter combinations +4. **Use NCCL debug output** to collect timing information + +The key is to have comprehensive data covering: +- Different message sizes (small to large) +- Various topologies (single node, multi-node) +- All relevant algorithm/protocol combinations +- Different channel counts and pipeline configurations diff --git a/ext-tuner/example/scripts/optimize_config.py b/ext-tuner/example/scripts/optimize_config.py new file mode 100644 index 000000000..c5c9b7085 --- /dev/null +++ b/ext-tuner/example/scripts/optimize_config.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +""" +NCCL Tuner Configuration Optimizer + +Reads a CSV file containing performance data across different tuning parameters +and generates optimal NCCL tuner configurations based on the best performing +combinations. + +By default, creates growing size ranges that interpolate between the actual data sizes +for each unique dimension (node count, rank count combination). This ensures that +different cluster configurations get their own optimized size boundaries, as +performance characteristics often vary significantly between topologies. + +Each dimension gets its own set of ranges starting from 0 and extending to the maximum +size for that dimension, with boundaries at midpoints between consecutive data sizes. + +CSV Input Format: +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us + +Output Format (NCCL Tuner Config): +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + +Usage Examples: + # Auto-create dimension-specific interpolated ranges (default) + python3 optimize_config.py data.csv + + # Use custom size ranges (applied to all topologies) + python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576" + + # Use hardcoded default ranges (applied to all topologies) + python3 optimize_config.py data.csv --no-auto-ranges +""" + +import csv +import argparse +import sys +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Any + +class PerformanceData: + def __init__(self, row: Dict[str, str]): + self.collective = row['collective'] + self.size_bytes = int(row['size_bytes']) + self.algorithm = row['algorithm'] + self.protocol = row['protocol'] + self.channels = int(row['channels']) if row['channels'] != '-1' else -1 + self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1 + self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1 + self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1 + self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1 + + # Performance metrics + self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better + self.latency_us = float(row.get('latency_us', 0)) # Lower is better + + def get_config_key(self) -> Tuple: + """Generate a key for grouping similar configurations""" + return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff) + + def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]: + """Find which size range this data point belongs to for its dimension""" + topology_key = (self.nodes, self.ranks) + + # Get size ranges for this dimension, or fall back to default + if topology_key in topology_size_ranges: + size_ranges = topology_size_ranges[topology_key] + elif (-1, -1) in topology_size_ranges: + 
size_ranges = topology_size_ranges[(-1, -1)] + else: + # Fallback to first available dimension ranges + size_ranges = next(iter(topology_size_ranges.values())) + + for min_size, max_size in size_ranges: + if min_size <= self.size_bytes <= max_size: + return (min_size, max_size) + # If no range found, create a single-point range + return (self.size_bytes, self.size_bytes) + +class ConfigOptimizer: + def __init__(self, optimization_metric: str = 'latency_us'): + self.optimization_metric = optimization_metric + # Default size ranges - will be overridden by auto-detection + self.size_ranges = [ + (0, 1024), + (1025, 64*1024), + (64*1024+1, 1024*1024), + (1024*1024+1, 16*1024*1024), + (16*1024*1024+1, 4*1024*1024*1024-1) + ] + self.auto_size_ranges = True + + def set_size_ranges(self, ranges: List[Tuple[int, int]]): + """Set custom size ranges for optimization""" + self.size_ranges = ranges + self.auto_size_ranges = False + + def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]: + """Create growing size ranges for each unique (nodes, ranks) dimension""" + if not data: + return {(-1, -1): self.size_ranges} + + # Group data by dimension (nodes, ranks) + topology_data = defaultdict(list) + for item in data: + topology_key = (item.nodes, item.ranks) + topology_data[topology_key].append(item) + + topology_ranges = {} + + for topology_key, items in topology_data.items(): + nodes, ranks = topology_key + + # Extract unique sizes for this dimension and sort them + unique_sizes = sorted(set(item.size_bytes for item in items)) + + if len(unique_sizes) <= 1: + # Only one size, create a single range from 0 to that size + size = unique_sizes[0] if unique_sizes else 0 + ranges = [(0, size)] + else: + # Create growing ranges that interpolate between data points + ranges = [] + + for i, size in enumerate(unique_sizes): + if i == 0: + # First range: 0 to midpoint between first and second size + if len(unique_sizes) > 1: + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + else: + max_size = size + min_size = 0 + elif i == len(unique_sizes) - 1: + # Last range: previous max + 1 to current size (and beyond) + min_size = ranges[-1][1] + 1 + max_size = size + else: + # Intermediate ranges: previous max + 1 to midpoint with next size + min_size = ranges[-1][1] + 1 + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + + ranges.append((min_size, max_size)) + + topology_ranges[topology_key] = ranges + + print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:") + for i, (min_size, max_size) in enumerate(ranges): + # Count data points that fall in this range for this dimension + count = sum(1 for item in items if min_size <= item.size_bytes <= max_size) + actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size)) + if actual_sizes: + size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3]) + if len(actual_sizes) > 3: + size_list += f", ... 
(+{len(actual_sizes)-3} more)" + print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})") + + return topology_ranges + + def load_data(self, csv_file: str) -> List[PerformanceData]: + """Load performance data from CSV file""" + data = [] + try: + with open(csv_file, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + try: + data.append(PerformanceData(row)) + except (ValueError, KeyError) as e: + print(f"Warning: Skipping invalid row: {row} - {e}") + except FileNotFoundError: + print(f"Error: File {csv_file} not found") + sys.exit(1) + except Exception as e: + print(f"Error reading {csv_file}: {e}") + sys.exit(1) + + print(f"Loaded {len(data)} performance data points") + + # Auto-determine size ranges if enabled + if self.auto_size_ranges and data: + self.topology_size_ranges = self.auto_determine_size_ranges(data) + else: + # Use default ranges for all topologies + self.topology_size_ranges = {(-1, -1): self.size_ranges} + + return data + + def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool: + """Determine if new_data is better than current_best""" + if self.optimization_metric == 'bandwidth_gbps': + return new_data.bandwidth_gbps > current_best.bandwidth_gbps + elif self.optimization_metric == 'latency_us': + return new_data.latency_us < current_best.latency_us + else: + # Default to latency + return new_data.latency_us < current_best.latency_us + + def optimize_configurations(self, data: List[PerformanceData]) -> List[str]: + """Find optimal configurations and return as NCCL config strings""" + # Group data by configuration key and size range + grouped_data = defaultdict(lambda: defaultdict(list)) + + for item in data: + config_key = item.get_config_key() + size_range = item.get_size_range_key(self.topology_size_ranges) + grouped_data[config_key][size_range].append(item) + + # Store optimal configurations before combining ranges + optimal_configs = [] + + for config_key, size_ranges_dict in grouped_data.items(): + collective, nodes, ranks, pipeOps, regBuff = config_key + + for (min_size, max_size), items in size_ranges_dict.items(): + if not items: + continue + + # Find the best performing configuration for this size range + best_item = items[0] + for item in items[1:]: + if self.is_better(item, best_item): + best_item = item + + # Store the optimal configuration with its range + optimal_configs.append({ + 'collective': collective, + 'min_size': min_size, + 'max_size': max_size, + 'algorithm': best_item.algorithm, + 'protocol': best_item.protocol, + 'channels': best_item.channels, + 'nodes': best_item.nodes, + 'ranks': best_item.ranks, + 'pipeOps': best_item.pipeOps, + 'regBuff': best_item.regBuff, + 'metric_value': getattr(best_item, self.optimization_metric) + }) + + # Combine sequential ranges with identical tunings + combined_configs = self.combine_sequential_ranges(optimal_configs) + + # Generate config strings + configs = [] + for config in combined_configs: + config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}" + configs.append(config_str) + + print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: " + f"{config['algorithm']}/{config['protocol']} channels={config['channels']} " + f"({self.optimization_metric}={config['metric_value']:.3f})") + + 
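+        # Each generated entry is one CSV line in the same 10-field format that
+        # the example plugin's loadConfig() parses, so the output file can be
+        # pointed to directly via NCCL_TUNER_CONFIG_FILE.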
return configs + + def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]: + """Combine sequential ranges that have identical tuning parameters""" + if not configs: + return configs + + # Group by collective and topology (nodes, ranks) + topology_groups = defaultdict(list) + for config in configs: + topology_key = (config['collective'], config['nodes'], config['ranks'], + config['pipeOps'], config['regBuff']) + topology_groups[topology_key].append(config) + + combined_configs = [] + + for topology_key, topology_configs in topology_groups.items(): + # Sort by min_size to ensure proper ordering + topology_configs.sort(key=lambda x: x['min_size']) + + # Group by tuning parameters (algorithm, protocol, channels) + tuning_groups = defaultdict(list) + for config in topology_configs: + tuning_key = (config['algorithm'], config['protocol'], config['channels']) + tuning_groups[tuning_key].append(config) + + # For each tuning group, combine sequential ranges + for tuning_key, tuning_configs in tuning_groups.items(): + if not tuning_configs: + continue + + # Sort by min_size + tuning_configs.sort(key=lambda x: x['min_size']) + + # Combine sequential ranges + current_config = tuning_configs[0].copy() + + for next_config in tuning_configs[1:]: + # Check if ranges are adjacent or overlapping + if current_config['max_size'] + 1 >= next_config['min_size']: + # Extend the current range + current_config['max_size'] = max(current_config['max_size'], next_config['max_size']) + # Update metric value to the better one + if self.optimization_metric == 'bandwidth_gbps': + if next_config['metric_value'] > current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: # latency_us or default + if next_config['metric_value'] < current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: + # Gap between ranges, save current and start new one + combined_configs.append(current_config) + current_config = next_config.copy() + + # Add the last configuration + combined_configs.append(current_config) + + # Sort final configs by collective, nodes, ranks, then min_size + combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size'])) + + original_count = len(configs) + combined_count = len(combined_configs) + if combined_count < original_count: + print(f"Combined {original_count} ranges into {combined_count} ranges " + f"(reduced by {original_count - combined_count})") + + return combined_configs + + def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True): + """Append optimized configurations to NCCL tuner config file""" + try: + # Create directory if it doesn't exist + config_dir = os.path.dirname(config_file) + if config_dir and not os.path.exists(config_dir): + os.makedirs(config_dir) + print(f"Created directory: {config_dir}") + + # Check if file exists and has content + file_exists = os.path.exists(config_file) + add_separator = False + + if file_exists: + with open(config_file, 'r') as f: + content = f.read().strip() + add_separator = len(content) > 0 + print(f"Appending to existing file: {config_file}") + else: + print(f"Creating new file: {config_file}") + + with open(config_file, 'a') as f: + if add_separator: + f.write("\n\n") + + if add_header: + f.write(f"# Optimized configurations generated by optimize_config.py\n") + f.write(f"# Optimization metric: {self.optimization_metric}\n") + f.write(f"# Format: 
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n") + + for config in configs: + f.write(f"{config}\n") + + if file_exists: + print(f"Appended {len(configs)} optimized configurations to {config_file}") + else: + print(f"Created {config_file} with {len(configs)} optimized configurations") + + except PermissionError: + print(f"Error: Permission denied writing to {config_file}") + print("Try running with appropriate permissions or choose a different output location") + sys.exit(1) + except OSError as e: + print(f"Error: Cannot create/write to {config_file}: {e}") + print("Check that the path is valid and you have write permissions") + sys.exit(1) + except Exception as e: + print(f"Unexpected error writing to {config_file}: {e}") + sys.exit(1) + +def main(): + parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data") + parser.add_argument("csv_file", help="Input CSV file with performance data") + parser.add_argument("-o", "--output", default="nccl_tuner.conf", + help="Output NCCL tuner config file (default: nccl_tuner.conf)") + parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'], + default='latency_us', help="Optimization metric (default: latency_us)") + parser.add_argument("--no-header", action="https://github.com/store_true", + help="Don't add header comments to output file") + parser.add_argument("--dry-run", action="https://github.com/store_true", + help="Print configurations without writing to file") + parser.add_argument("--no-auto-ranges", action="https://github.com/store_true", + help="Disable automatic size range determination (use default ranges)") + parser.add_argument("--size-ranges", type=str, + help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'") + + args = parser.parse_args() + + optimizer = ConfigOptimizer(args.metric) + + # Handle size range configuration + if args.size_ranges: + # Parse custom size ranges + try: + ranges = [] + for range_str in args.size_ranges.split(','): + min_size, max_size = map(int, range_str.split('-')) + ranges.append((min_size, max_size)) + optimizer.set_size_ranges(ranges) + print(f"Using custom size ranges: {ranges}") + except ValueError: + print("Error: Invalid size ranges format. 
Use 'min1-max1,min2-max2,...'") + sys.exit(1) + elif args.no_auto_ranges: + # Disable auto-ranging + optimizer.auto_size_ranges = False + print("Using default hardcoded size ranges") + else: + # Auto-ranging is enabled by default - creates one bucket per unique size + optimizer.auto_size_ranges = True + print("Auto-ranging enabled: will create one bucket per unique size in data") + + # Load and optimize data + data = optimizer.load_data(args.csv_file) + if not data: + print("No valid data found in CSV file") + sys.exit(1) + + configs = optimizer.optimize_configurations(data) + + if args.dry_run: + print("\nGenerated configurations:") + for config in configs: + print(config) + else: + optimizer.append_to_config_file(configs, args.output, not args.no_header) + +if __name__ == "__main__": + main() diff --git a/ext-tuner/example/scripts/sample_performance_data.csv b/ext-tuner/example/scripts/sample_performance_data.csv new file mode 100644 index 000000000..7b96403c0 --- /dev/null +++ b/ext-tuner/example/scripts/sample_performance_data.csv @@ -0,0 +1,24 @@ +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5 +allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8 +allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2 +allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1 +allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3 +allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4 +allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2 +allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1 +allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7 +allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1 +broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2 +broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1 +broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5 +broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2 +broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1 +broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3 +allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4 +allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6 +allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1 +allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3 +allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2 +allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1 +allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3 diff --git a/ext-tuner/example/test/Makefile b/ext-tuner/example/test/Makefile new file mode 100644 index 000000000..d675cbe1e --- /dev/null +++ b/ext-tuner/example/test/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for NCCL Tuner Plugin Unit Tests +# + +CC := gcc +CFLAGS := -Wall -Wextra -g -std=c99 -fPIC +INC := -I. 
-I../nccl +TARGET := test_plugin +SOURCES := test_plugin.c + +# Default target +all: $(TARGET) + +# Build the test executable +$(TARGET): $(SOURCES) + $(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES) + +# Run the tests +test: $(TARGET) + ./$(TARGET) $(TEST_CASE) + +# Run tests with verbose output +test-verbose: $(TARGET) + NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE) + +# Clean build artifacts +clean: + rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf + +.PHONY: all test test-verbose clean diff --git a/ext-tuner/example/test/README.md b/ext-tuner/example/test/README.md new file mode 100644 index 000000000..8203c65a1 --- /dev/null +++ b/ext-tuner/example/test/README.md @@ -0,0 +1,205 @@ +# NCCL Tuner Plugin Unit Tests + +This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates. + +## Test Structure + +``` +test/ +├── test_plugin.c # Main unit test file +├── Makefile # Build system for tests +└── README.md # This file +``` + +## Building and Running Tests + +### Quick Start + +```bash +# Build and run all tests +make test + +# Or step by step +make # Build test executable +./test_plugin # Run tests +``` + +### Advanced Testing + +```bash +# Run with memory leak detection (requires valgrind) +make test-memory + +# Run with verbose logging +make test-verbose + +# Generate code coverage report (requires gcov) +make coverage + +# Create sample test configuration files +make test-configs +``` + +## Test Coverage + +The unit tests cover the following functionality: + +### 1. **Plugin Initialization (`test_plugin_init`)** +- Tests successful plugin initialization +- Verifies context allocation +- Tests cleanup on destroy + +### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)** +- Valid CSV format parsing +- Comment and empty line handling +- Invalid format graceful handling +- Environment variable configuration + +### 3. **Collective Type Matching (`test_collective_matching`)** +- Correct matching of allreduce, broadcast, etc. +- Algorithm/protocol selection +- Channel configuration + +### 4. **Size Range Matching (`test_size_matching`)** +- Small, medium, large message size handling +- Proper range boundary checking +- Multiple size-based configurations + +### 5. **Topology Matching (`test_topology_matching`)** +- Single-node vs multi-node configurations +- Exact nNodes/nRanks matching +- Wildcard matching (-1 values) + +### 6. **Default Channels (`test_default_channels`)** +- Proper handling of -1 channel specification +- Preservation of NCCL default behavior + +### 7. **Registered Buffer Matching (`test_regbuff_matching`)** +- Configurations based on regBuff parameter +- Registered vs non-registered buffer handling +- Backward compatibility with configs missing regBuff + +### 8. **Pipeline Operations Matching (`test_pipeops_matching`)** +- Configurations based on numPipeOps parameter +- Single vs multiple pipeline operation handling +- Backward compatibility with configs missing numPipeOps + +### 9. 
**Fallback Behavior (`test_no_match_fallback`)** +- Default behavior when no config matches +- Ring/Simple algorithm fallback + +## Test Output + +Successful test run: +``` +Running NCCL Tuner Plugin Unit Tests +===================================== +PASS: test_plugin_init +PASS: test_config_parsing_valid +PASS: test_config_parsing_invalid +PASS: test_collective_matching +PASS: test_size_matching +PASS: test_topology_matching +PASS: test_default_channels +PASS: test_regbuff_matching +PASS: test_pipeops_matching +PASS: test_no_match_fallback + +===================================== +Test Results: 9/9 tests passed +All tests PASSED! +``` + +Failed test example: +``` +FAIL: test_collective_matching - Tree/Simple should have low cost +Test Results: 8/9 tests passed +Some tests FAILED! +``` + +## Mock NCCL Implementation + +The tests use the actual NCCL header files from the `../nccl/` directory: + +- `tuner.h` - Complete NCCL tuner interface and type definitions +- `common.h` - Common NCCL types and logging functions +- `err.h` - NCCL error codes + +This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation. + +## Integration with CI/CD + +```bash +# Install tests for CI/CD pipeline +make install-test + +# Run as part of automated testing +make test && echo "Tests passed" || echo "Tests failed" +``` + +## Memory Testing + +The tests can be run with valgrind for memory leak detection: + +```bash +make test-memory +``` + +This will detect: +- Memory leaks +- Invalid memory access +- Use of uninitialized memory + +## Code Coverage + +Generate code coverage reports to ensure comprehensive testing: + +```bash +make coverage +# Creates test_plugin.c.gcov with line-by-line coverage +``` + +## Adding New Tests + +To add a new test: + +1. Create a new test function in `test_plugin.c`: +```c +int test_new_feature() { + // Test setup + TEST_ASSERT(condition, "description"); + // Test cleanup + TEST_PASS(); +} +``` + +2. Add the test to the main function: +```c +total++; passed += test_new_feature(); +``` + +3. Rebuild and run: +```bash +make test +``` + +## Debugging Tests + +For debugging failed tests: + +```bash +# Compile with debug symbols +make CFLAGS="-g -O0 -DDEBUG" + +# Run with gdb +gdb ./test_plugin +``` + +## Cleaning Up + +```bash +# Remove all build artifacts and temporary files +make clean +``` + +This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases. 
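+## Example Test Configuration
+
+The configuration files created by the tests use the same CSV format the
+plugin parses at runtime. A small illustrative example (the values mirror the
+test fixtures and are not tuned recommendations):
+
+```csv
+# collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+allreduce,0,65536,tree,simple,2,1,-1,-1,-1
+allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1
+broadcast,0,32768,ring,ll128,4,2,16,-1,-1
+```
+
+A -1 in nNodes, nRanks, numPipeOps, or regBuff matches any value at runtime; a
+-1 channel count keeps NCCL's default channel selection.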
diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c new file mode 100644 index 000000000..28897c449 --- /dev/null +++ b/ext-tuner/example/test/test_plugin.c @@ -0,0 +1,856 @@ +/************************************************************************* + * Unit tests for NCCL Tuner Plugin + ************************************************************************/ + +#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions + +#include +#include +#include +#include +#include +#include +#include + + +// Include NCCL tuner header (which includes common.h and err.h) +#include "tuner.h" + +// Include plugin source for testing +#include "../plugin.c" + +// Test framework macros +#define TEST_ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + printf("FAIL: %s - %s\n", __func__, message); \ + return 0; \ + } \ + } while(0) + +#define TEST_PASS() \ + do { \ + printf("PASS: %s\n", __func__); \ + return 1; \ + } while(0) + +// Global test state +static int test_log_count = 0; + +// Mock logger function +void mock_logger(ncclDebugLogLevel level, unsigned long flags, + const char* file, int line, const char* fmt, ...) { + (void)flags; // Suppress unused parameter warning + test_log_count++; + + // Check if we should print based on NCCL_DEBUG level + const char* debug_level = getenv("NCCL_DEBUG"); + int should_print = 0; + + if (debug_level) { + if (strcmp(debug_level, "TRACE") == 0) { + should_print = 1; // Print everything + } else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) { + should_print = 1; // Print INFO and below + } else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) { + should_print = 1; // Print WARN and below + } + } + + if (!should_print) return; + + // Convert log level to string + const char* level_str; + switch(level) { + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_VERSION: level_str = "VERSION"; break; + case NCCL_LOG_WARN: level_str = "WARN"; break; + case NCCL_LOG_INFO: level_str = "INFO"; break; + case NCCL_LOG_ABORT: level_str = "ABORT"; break; + case NCCL_LOG_TRACE: level_str = "TRACE"; break; + default: level_str = "UNKNOWN"; break; + } + + // Print log header + printf("[TUNER:%s:%s:%d] ", level_str, file, line); + + // Print formatted message + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +// Helper function to create test config file +void create_test_config(const char* filename, const char* content) { + FILE* f = fopen(filename, "w"); + if (f) { + fprintf(f, "%s", content); + fclose(f); + } +} + +// Test 1: Plugin initialization +int test_plugin_init() { + void* context = NULL; + + // Test successful initialization + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Clean up + pluginDestroy(context); + TEST_PASS(); +} + +// Test 2: Configuration file parsing - valid CSV +int test_config_parsing_valid() { + const char* test_config = + "# Test configuration\n" + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n" + "# Comment line\n" + "\n" // Empty line + "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n"; + + create_test_config("test_valid.conf", test_config); + + // Set environment variable to use our test config + setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); + + void* context = NULL; + ncclResult_t result = 
pluginInit(16, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); + + // Clean up + pluginDestroy(context); + unlink("test_valid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 3: Configuration file parsing - invalid CSV +int test_config_parsing_invalid() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n" + "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n" + "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers + + create_test_config("test_invalid.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + // Should still succeed but with no valid configs loaded + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); + + // Clean up + pluginDestroy(context); + unlink("test_invalid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 4: Collective type matching +int test_collective_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n"; + + create_test_config("test_match.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + // Create mock cost table + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + + // Test allreduce matching (should match first config) + ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 8, "Should set 8 channels"); + + // Test broadcast matching (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Reset costs + } + } + + result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 4, "Should set 4 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_match.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 5: Size range matching +int test_size_matching() { + const char* test_config = + "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n" + 
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n" + "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n"; + + create_test_config("test_size.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + int nChannels = 1; + + pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels"); + + // Test medium message (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels"); + + // Test large message (should match third config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_size.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 6: Topology matching +int test_topology_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only + "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology + + create_test_config("test_topo.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1); + + // Test with single node setup + void* context1 = NULL; + pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + 
cost_table[i][j] = 1.0; + } + } + + int nChannels; + pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); + TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); + + pluginDestroy(context1); + + // Test with 4 nodes, 32 ranks setup + void* context2 = NULL; + pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config"); + TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels"); + + // Clean up + unlink("test_topo.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 7: Default channels behavior (-1) +int test_default_channels() { + const char* test_config = + "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels + + create_test_config("test_default.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels = 99; // Set to known value + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol"); + TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); + + // Clean up + pluginDestroy(context); + unlink("test_default.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 8: regBuff matching +int test_regbuff_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only + "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible) + + create_test_config("test_regbuff.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test registered buffer (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 1, &nChannels); // regBuff = 1 (registered) + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels"); + + // Test non-registered 
buffer (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); // regBuff = 0 (non-registered) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels"); + + // Test backward compatibility - config without regBuff should match any regBuff value + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + // First try with regBuff=2 (unusual value, should match third config) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 2, &nChannels); // regBuff = 2 (only third config should match) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_regbuff.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 9: numPipeOps matching +int test_pipeops_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op + "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible) + + create_test_config("test_pipeops.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test single pipeline op (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels"); + + // Test multiple pipeline ops (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels"); + + // Test different number of pipeline ops (should match third config - backward compatible) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 
0.0, "Any pipeOps: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_pipeops.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 10: No matching configuration (fallback behavior) +int test_no_match_fallback() { + const char* test_config = + "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config + + create_test_config("test_fallback.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + // Try allreduce (should not match, use fallback) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified"); + TEST_ASSERT(nChannels == 1, "Should use default channels"); + + // Clean up + pluginDestroy(context); + unlink("test_fallback.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 11: Large configuration files (testing dynamic allocation) +int test_large_config() { + const char* large_config_file = "test_large.conf"; + + // Create a large configuration file with many entries + // This tests the dynamic allocation functionality + FILE* f = fopen(large_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create large config file"); + + // Write header comment + fprintf(f, "# Large configuration file for testing dynamic allocation\n"); + fprintf(f, "# This file contains many configurations to test memory allocation\n"); + + // Generate a large number of configurations (much more than the old MAX_CONFIGS=100) + const int num_configs = 500; // 5x the old static limit + const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"}; + const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"}; + const char* protocols[] = {"simple", "ll", "ll128"}; + + for (int i = 0; i < num_configs; i++) { + // Vary the configurations to create realistic test data + const char* coll = collectives[i % 5]; + const char* algo = algorithms[i % 4]; + const char* proto = protocols[i % 3]; + + size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB + size_t max_bytes = min_bytes + 65536; // 64KB range + int channels = (i % 8) + 1; // 1-8 channels + int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes + int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks + int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps + int regBuff = (i % 3) == 0 ? 
-1 : (i % 2); // Mix of -1, 0, 1 + + fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n", + coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff); + } + + fclose(f); + + // Set environment to use our large config file + setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1); + + // Initialize plugin with large config + void* context = NULL; + ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Verify that configurations were loaded + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file"); + TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size"); + TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated"); + + // Test that we can access configurations throughout the array + // (This would have failed with the old static MAX_CONFIGS=100 limit) + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + // Basic sanity checks on the loaded configurations + TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce, + "Collective type should be valid"); + TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes"); + TEST_ASSERT(config->nChannels > 0, "nChannels should be positive"); + } + + // Test specific configuration access at various indices + // Index 0 (first config) + TuningConfig* first_config = &ctx->configs[0]; + TEST_ASSERT(first_config != NULL, "First config should be accessible"); + + // Index in middle + TuningConfig* mid_config = &ctx->configs[num_configs / 2]; + TEST_ASSERT(mid_config != NULL, "Middle config should be accessible"); + + // Index near end (this would have crashed with static array of 100) + TuningConfig* late_config = &ctx->configs[num_configs - 1]; + TEST_ASSERT(late_config != NULL, "Last config should be accessible"); + + // Test memory allocation size - verify we didn't over-allocate + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory allocated for %d configurations (%zu bytes total)", + ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig)); + + // Test that the plugin can still find matching configurations from the large set + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + // Try to find a matching configuration - should work with large config set + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); + + // Clean up + pluginDestroy(context); + unlink(large_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 12: Very large configuration stress test +int test_very_large_config_stress() { + const char* stress_config_file = "test_stress.conf"; + + // Create an even larger configuration file to stress test the 
implementation + FILE* f = fopen(stress_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create stress test config file"); + + fprintf(f, "# Stress test configuration with very large number of entries\n"); + + // Generate an extremely large number of configurations + const int stress_configs = 2000; // 20x the old static limit + + for (int i = 0; i < stress_configs; i++) { + // Create varied but valid configurations + fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n", + i * 512, (i * 512) + 1024); + } + + fclose(f); + + setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1); + + // Test initialization with stress config + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations"); + TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated"); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Stress test - loaded %d configurations successfully", stress_configs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory usage: %zu bytes for configuration array", + stress_configs * sizeof(TuningConfig)); + + // Verify we can access configurations throughout the entire range + for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config + TuningConfig* config = &ctx->configs[i]; + TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type"); + TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes"); + } + + // Clean up + pluginDestroy(context); + unlink(stress_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 13: Edge case - empty config file +int test_empty_config() { + const char* empty_config_file = "test_empty.conf"; + + // Create empty config file (only comments) + create_test_config(empty_config_file, + "# Empty configuration file\n" + "# No actual configurations\n" + "\n" + "\n"); + + setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations"); + TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations"); + TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config"); + + // Test that plugin still works with no configurations (fallback behavior) + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); + + // Clean up + pluginDestroy(context); + unlink(empty_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test runner function pointer type +typedef int (*TestFunction)(void); + +// Test registry +typedef struct { + const char* name; + TestFunction func; + 
const char* description; +} TestCase; + +// All available tests +TestCase test_cases[] = { + {"init", test_plugin_init, "Plugin initialization"}, + {"config-valid", test_config_parsing_valid, "Valid configuration parsing"}, + {"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"}, + {"collective", test_collective_matching, "Collective type matching"}, + {"size", test_size_matching, "Size range matching"}, + {"topology", test_topology_matching, "Topology matching"}, + {"channels", test_default_channels, "Default channels behavior"}, + {"regbuff", test_regbuff_matching, "Registered buffer matching"}, + {"pipeops", test_pipeops_matching, "Pipeline operations matching"}, + {"fallback", test_no_match_fallback, "Fallback behavior"}, + {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, + {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, + {"empty-config", test_empty_config, "Empty configuration file handling"}, + {NULL, NULL, NULL} // End marker +}; + +// Show help/usage information +void show_help(const char* program_name) { + printf("Usage: %s [test_name ...]\n\n", program_name); + printf("Available tests:\n"); + for (int i = 0; test_cases[i].name != NULL; i++) { + printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description); + } + printf("\nExamples:\n"); + printf(" %s # Run all tests\n", program_name); + printf(" %s init # Run only initialization test\n", program_name); + printf(" %s init collective # Run initialization and collective tests\n", program_name); + printf(" %s --help # Show this help\n", program_name); +} + +// Find test by name +TestFunction find_test(const char* name) { + for (int i = 0; test_cases[i].name != NULL; i++) { + if (strcmp(test_cases[i].name, name) == 0) { + return test_cases[i].func; + } + } + return NULL; +} + +// Main test runner +int main(int argc, char* argv[]) { + int passed = 0, total = 0; + + // Check for help + if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) { + show_help(argv[0]); + return 0; + } + + printf("Running NCCL Tuner Plugin Unit Tests\n"); + printf("=====================================\n"); + + if (argc == 1) { + // No arguments - run all tests + for (int i = 0; test_cases[i].name != NULL; i++) { + total++; + passed += test_cases[i].func(); + } + } else { + // Run specific tests + for (int arg = 1; arg < argc; arg++) { + TestFunction test_func = find_test(argv[arg]); + if (test_func) { + total++; + passed += test_func(); + } else { + printf("ERROR: Unknown test '%s'\n", argv[arg]); + printf("Use --help to see available tests\n"); + return 1; + } + } + } + + printf("\n=====================================\n"); + printf("Test Results: %d/%d tests passed\n", passed, total); + + if (passed == total) { + printf("All tests PASSED!\n"); + return 0; + } else { + printf("Some tests FAILED!\n"); + return 1; + } +} diff --git a/makefiles/common.mk b/makefiles/common.mk index 8a35a8fab..6ba9bbfce 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -40,10 +40,12 @@ ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 +CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 -CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ - -gencode=arch=compute_120,code=sm_120 +CUDA12_8_GENCODE = 
-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 +CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 @@ -53,10 +55,10 @@ CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) # Prior to SM75 is deprecated from CUDA13.0 onwards - NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index f41e7a783..013e972f3 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/device/Makefile b/src/device/Makefile index df58489a0..67ab176ca 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -36,9 +36,8 @@ define COMPILE $(call COMPILE$(or $3,$(suffix $2)),$1,$2) endef -ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) - NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ - -gencode=arch=compute_120a,code=sm_120a +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12090))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100f,code=sm_100f else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a else diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index 0d054bb2d..d36dfe5a7 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -1009,7 +1009,7 @@ struct Apply_LoadMultimem { DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) #endif - #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + #if NCCL_CUDA_ARCH_SPECIFIC == 1000 || NCCL_CUDA_ARCH_SPECIFIC == 1010 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index f630ff072..8fcb9a425 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -108,7 +108,7 @@ def required_cuda(k): if k.algo in ldmc_algos: cudart = 12070 arch = None - specific_sms = [100, 120] + specific_sms = ["100a", "101a", "100f", "101f", "120a", 
"121a"] return (cudart, arch, specific_sms) ################################################################################ @@ -145,7 +145,7 @@ def kernel_conds(k): if not specific_sms: arch_cond = "__CUDA_ARCH__ >= %d"%arch else: - arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_%sSPECIFIC==%d"%("FAMILY_" if sm[-1] == "f" else "", 10*int(sm.replace('a', '').replace('f', ''))) for sm in specific_sms]) return cudart_cond, arch_cond def instantiate(k): diff --git a/src/graph/paths.cc b/src/graph/paths.cc index bc5cc755e..4b44abd01 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -175,6 +175,13 @@ ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu return ncclSuccess; } +static int mergePathType(int type0, int type1){ + int max = std::max(type0,type1); + int min = std::min(type0,type1); + if(max == PATH_PHB && min == PATH_C2C) return PATH_P2C; + else return max; +} + static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; @@ -187,7 +194,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + srcNode->paths[t2][i2].type = mergePathType(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; @@ -674,9 +681,9 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm int c; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (c == -1) continue; - if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { - gpuNode->paths[NET][n].type = PATH_P2C; - netNode->paths[GPU][g].type = PATH_P2C; + if (mergePathType(gpuNode->paths[CPU][c].type, netNode->paths[CPU][c].type) == PATH_P2C) { + gpuNode->paths[NET][n].type = std::min(PATH_P2C, gpuNode->paths[NET][n].type); + netNode->paths[GPU][g].type = std::min(PATH_P2C, netNode->paths[GPU][g].type); } } } @@ -695,16 +702,15 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (/* (1) is either connected to the NIC with PXB*/ - (peerNode->paths[NET][n].type <= PATH_PXB || - /* or with P2C and PxN over C2C is enabled */ - (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + int pxnType = ncclParamPxnC2c() ? PATH_P2C : PATH_PXB; + if (/* (1) is connected to the NIC with PxN type*/ + peerNode->paths[NET][n].type <= pxnType && /* and (2) is connected to us through NVLink */ peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) // We can use that GPU as relay to communicate with that NIC. 
// Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) @@ -725,6 +731,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + + // Pre-compute NET local gpus to accelerate search + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &net->net.localGpu)); + } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9d8ad3ff8..67e600906 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -437,6 +437,65 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } +// Add the preferred NICs ordered by GPU first +static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + const int nGpus = (gpu == -1) ? system->nodes[GPU].count : 1; + int gpuCount = nGpus; + int gpuIds[NCCL_TOPO_MAX_NODES] = {gpu}; + int firstNets[NCCL_TOPO_MAX_NODES]; + if (gpu == -1) + for (int g = 0; g < nGpus; g++) gpuIds[g] = g; + + for (int c = 0; c < MAXCHANNELS; c++) { + for (int g = 0; g < nGpus; g++) { + if (gpuIds[g] == -1) continue; + int localNet; + int64_t netId; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + gpuIds[g]; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &localNet)); + // store the first net found for each GPU in case of duplicates + if(c == 0) firstNets[g] = localNet; + // if the NET has already been returned for channel 0, that GPU is done + if (c > 0 && firstNets[g] == localNet) { + gpuIds[g] = -1; + gpuCount--; + continue; + } + // only add it to the list if it doesn't already exist + int found = 0; + while (found < (*netCount) && nets[found] != localNet) found++; + if (found == (*netCount)) nets[(*netCount)++] = localNet; + } + if (gpuCount == 0) break; + } + return ncclSuccess; +} + +// Add the preferred NICs ordered by channels first +static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + for (int g = 0; g < system->nodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + int localNetCount = 0, localNets[MAXCHANNELS]; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g; + for (int c = 0; c < MAXCHANNELS; c++) { + int64_t netId; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets + localNetCount)); + if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; + localNetCount++; + } + // Append NICs to list + for (int i = 0; i < localNetCount; i++) { + int n = localNets[i]; + int found = 0; + while (found < (*netCount) && nets[found] != n) found++; + if (found == (*netCount)) nets[(*netCount)++] = n; + } + } + return ncclSuccess; +} + // Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu @@ -445,39 +504,25 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. 
- +NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; - int localNetCount; - int localNets[MAXCHANNELS]; - // First add the preferred NICs - for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; - localNetCount = 0; - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0; cgpu.rank, c, &netId, NULL)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); - if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; - localNetCount++; - } - // Append NICs to list - for (int i=0; inHosts > 1 && ncclParamScatterEnable()) { + // For MNNVL systems, we sort the devices by GPU first, then by channel + NCCLCHECK(ncclTopoPrefNetsGpuFirst(system, gpu, nets, &netCount)); + } else { + // For other systems, we sort the devices by channel first, then by GPU + NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { - for (int g=0; gnodes[GPU].count; g++) { + for (int g = 0; g < system->nodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; - localNetCount = 0; + int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && npattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { - int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + int gpu = net->net.localGpu; if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -643,13 +687,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } else { - if (graph->nChannels > 0) { + if (graph->nChannels > 0 && graph->sameChannels == 1) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); - } - if (graph->nChannels == 0 || graph->sameChannels == 0) { + } else { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; @@ -658,11 +701,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs + int localGpu = net->net.localGpu; + if (localGpu != -1) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpu)); + } int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); // if no GPUs are connected, skip this net if (pathType == PATH_DIS) continue; for (int g = 0; g < localGpuCount; ++g) { + if (localGpus[g] == localGpu) continue; // We already tried this one NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } @@ -749,8 +797,8 @@ struct kvDict kvDictLinkType[] = { { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", 
PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -798,8 +846,10 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); - if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; + NCCLCHECK(xmlGetAttr(xmlGraph, "latencyinter", &str)); + if (!str) INFO(NCCL_GRAPH, "latencyinter not found in graph, using 0.0"); + graph->latencyInter = str ? strtof(str, NULL) : 0.0; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); @@ -910,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1136,8 +1186,12 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); + int g; + ncclTopoRankToIndex(system, graph->intra[ngpus * c + i], &g, true); + int64_t topoId = system->nodes[GPU].nodes[g].id; + sprintf(line + offset, " %s/%lx-%lx", topoNodeTypeStr[GPU], NCCL_TOPO_ID_SYSTEM_ID(topoId), NCCL_TOPO_ID_LOCAL_ID(topoId)); offset = strlen(line); + if (graph->id == 3) break; // NVLS graphs only use the first GPU } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9fe81bbcd..8fdf54ea4 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -21,7 +21,7 @@ const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "P2C", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -677,7 +677,14 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } - for (int systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId; + + int systemId = 0; + while (systemId < system->nHosts && system->hostHashes[systemId] != localHostHash) systemId++; + system->systemId = systemId; + if(systemId == system->nHosts){ + 
WARN("localHostHash = 0x%lx not found in the list of system hostHashes",localHostHash); + return ncclInvalidArgument; + } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); @@ -1143,8 +1150,8 @@ struct kvDict nicPathKvList[] = { { "PORT", PATH_PORT }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1421,7 +1428,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } // Only update our topo tracking structure if we aren't dumping (separate steps) - if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, getHostHash()), ret, fail); exit: if (!comm->MNNVL && localRanks) free(localRanks); diff --git a/src/graph/topo.h b/src/graph/topo.h index 07ef5e105..9b49c0222 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,7 +18,7 @@ #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 -#define SM100_NVLINK_BW 40.0 +#define SM100_NVLINK_BW 40.1 #define PCI_BW 12.0 // PCI Gen3 x16 #define AMD_BW 16.0 #define BDW_QPI_BW 6.0 @@ -76,11 +76,11 @@ extern const char* topoLinkTypeStr[]; // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 5 -// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. -#define PATH_PXN 6 - // Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC -#define PATH_P2C 7 +#define PATH_P2C 6 + +// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. +#define PATH_PXN 7 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 8 @@ -143,6 +143,7 @@ struct ncclTopoNode { int gdrSupport; int collSupport; int maxChannels; + int localGpu; }net; struct { int arch; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 64dc5cf22..8e99f18c3 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -455,9 +455,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN))); + if (ncclParamLl128C2c() && minCompCap >= 90) { + // Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN. + pEnable &= (graphs[a]->typeInter <= PATH_PXN); + } else { + // Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links. + pEnable &= (graphs[a]->typeInter <= PATH_PXB); + if (!ncclParamLl128C2c() && minCompCap >= 90) + INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). 
This ensures that no C2C link will be used by LL128."); + } pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2)); diff --git a/src/init.cc b/src/init.cc index 83764a883..2a57c46c0 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1507,7 +1507,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; - int collnetEnableEnv; + const char* collnetEnableEnv; int ctaPolicyEnv; int shrinkShareEnv; int nvlsCTAsEnv; @@ -1561,9 +1561,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.shrinkShare = shrinkShareEnv; } - collnetEnableEnv = ncclParamCollnetEnable(); - if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.collnetEnable = collnetEnableEnv; + // NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init + // since users might change the env on the fly to enable/disable collnet + collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE"); + if (collnetEnableEnv != NULL) { + int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0); + if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) { + comm->config.collnetEnable = collnetEnableInt; + INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt); + } } ctaPolicyEnv = ncclParamCtaPolicy(); diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc index 5bb4109f3..47cc4eb0d 100644 --- a/src/misc/mlx5dvsymbols.cc +++ b/src/misc/mlx5dvsymbols.cc @@ -52,6 +52,9 @@ ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ cast = (void**)&funcptr; \ *cast = dlvsym(handle, symbol, version); \ + if (*cast == NULL) { \ + INFO(NCCL_NET, "dlvsym failed on %s - %s version %s", symbol, dlerror(), version); \ + } \ } while (0) LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported); diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 0adb4b137..1766f4167 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -21,7 +21,6 @@ struct ncclStrongStreamCapture { cudaGraph_t graph; unsigned long long graphId; cudaStream_t captureStream; - cudaGraphNode_t lastRecord; void* acquiredBy; }; @@ -216,7 +215,6 @@ ncclResult_t ncclStrongStreamAcquire( CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } cap->graphId = graph.graphId; - cap->lastRecord = nullptr; cap->acquiredBy = localThreadId(); // Push to capturing list. cap->next = ss->captureHead; @@ -296,16 +294,6 @@ ncclResult_t ncclStrongStreamRelease( cudaGraphNode_t recordNode; CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); - // Make this record order after previous record on this stream. - if (cap->lastRecord != nullptr) { - #if CUDART_VERSION >= 13000 - CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1)); - #else - CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); - #endif - } - cap->lastRecord = recordNode; - // Get current nodes from work stream so we can add them as dependencies. cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; @@ -338,6 +326,22 @@ ncclResult_t ncclStrongStreamRelease( } } + // Make every future operation captured on cap->captureStream depend on 'recordNode'. 
+ #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2( + cap->captureStream, + &recordNode, /* dependencies */ + /*edges =*/ nullptr, /* no edge annotations */ + 1, /* count */ + cudaStreamSetCaptureDependencies)); + #else + CUDACHECK(cudaStreamUpdateCaptureDependencies( + cap->captureStream, + &recordNode, + 1, + cudaStreamSetCaptureDependencies)); + #endif + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { WARN("%s", launchRaceFatalMsg); return ncclInvalidUsage; diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index a9c1d0dc0..64c97be39 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -61,20 +61,20 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { char eNoEntNameList[PATH_MAX] = { 0 }; if (libName && strlen(libName)) { - // match names that start with 'lib' and end with '.so' - if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) { - snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + + // libName can't be a relative or absolute path (start with '.' or contain any '/'). 
It can't be a library name either (start with 'lib' or end with '.so') + if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 19a505e1c..40897d93f 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -494,7 +494,9 @@ static int ibvSpeeds[] = { 14000, /* FDR */ 25000, /* EDR */ 50000, /* HDR */ - 100000 /* NDR */ }; + 100000, /* NDR */ + 200000 /* XDR */ +}; static int firstBitSet(int val, int max) { int i = 0; @@ -654,7 +656,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ibProvider = IB_PROVIDER_MLX5; snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); + INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); if(ncclParamIbDataDirect()) dataDirectSupported = 1; } } From 7c12c627c62ef4e5a2485777a8d9dce58f3f562f Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Fri, 11 Jul 2025 07:32:13 -0700 Subject: [PATCH 13/21] NCCL 2.27.6-1 Improve support for DirectNIC (CX8) * Add support for XDR speed detection. * When DirectNIC is enabled, report only the RDMA interfaces. Extend the P2C (PXN over C2C) support to send/receive operations. Support compilation with GCC 14 (Issues #1743, #1751). Fix the unloading of network plugins that also provide tuner capability. Fix the change of the current device across the calls to ncclCommDestroy() and ncclCommAbort(). A note for users on MNNVL systems: please ensure an adequate stack size for NCCL threads. While the default Linux stack size limit of 8192 KB is known to be sufficient, we've seen crashes if the limit is changed to "unlimited", as it causes the glibc library to unexpectedly *decrease* the stack size of NCCL's background threads to just 2048 KB. Use "ulimit -s" in bash to print the current limit; if needed, reset it to 8192 KB using "ulimit -s 8192" (one also needs to ensure that the new setting is propagated to other nodes when launching a multi-node NCCL job). 
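As a purely illustrative aside (not part of this patch), the minimal C sketch below shows how a thread's stack size can be pinned explicitly with pthread_attr_setstacksize and then verified from inside the thread. The worker function, the file name, and the 8 MB value are assumptions chosen to mirror the 8192 KB guidance above; this is not NCCL code. glibc normally derives the default thread stack size from the RLIMIT_STACK soft limit and falls back to an architecture default when the limit is "unlimited", which is consistent with the 2048 KB behavior described above.

    /* Hypothetical stand-alone example; build with: gcc -pthread stack.c */
    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdio.h>

    /* Stands in for a library background thread; reports the stack size it actually received. */
    static void* worker(void* arg) {
      (void)arg;
      pthread_attr_t attr;
      size_t stack = 0;
      if (pthread_getattr_np(pthread_self(), &attr) == 0) {
        pthread_attr_getstacksize(&attr, &stack);
        pthread_attr_destroy(&attr);
      }
      printf("worker stack size: %zu KB\n", stack / 1024);
      return NULL;
    }

    int main(void) {
      pthread_t tid;
      pthread_attr_t attr;
      pthread_attr_init(&attr);
      /* Pin the stack explicitly to 8 MB instead of inheriting the rlimit-derived default. */
      pthread_attr_setstacksize(&attr, 8u * 1024 * 1024);
      pthread_create(&tid, &attr, worker, NULL);
      pthread_join(tid, NULL);
      pthread_attr_destroy(&attr);
      return 0;
    }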
--- makefiles/common.mk | 2 +- makefiles/version.mk | 2 +- src/graph/paths.cc | 4 +- src/graph/search.cc | 5 ++- src/graph/topo.h | 2 + src/include/ibvcore.h | 76 ++++++++++++++++++++++++++----------- src/include/plugin/plugin.h | 10 ++++- src/init.cc | 8 +++- src/misc/ibvwrap.cc | 10 ++++- src/misc/socket.cc | 3 +- src/plugin/net.cc | 5 ++- src/plugin/plugin_open.cc | 38 +++++++++---------- src/plugin/profiler.cc | 5 ++- src/plugin/tuner.cc | 5 ++- src/transport/net_ib.cc | 19 +++++++--- 15 files changed, 128 insertions(+), 66 deletions(-) diff --git a/makefiles/common.mk b/makefiles/common.mk index 6ba9bbfce..0f01671b6 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -76,7 +76,7 @@ $(info NVCC_GENCODE is ${NVCC_GENCODE}) ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) CXXSTD ?= -std=c++17 else - CXXSTD ?= -std=c++11 + CXXSTD ?= -std=c++14 endif CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ diff --git a/makefiles/version.mk b/makefiles/version.mk index 013e972f3..0f482d31a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 4b44abd01..82c0d9972 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -709,8 +709,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && - /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) + /* and (4) has either higher bw to that NIC or avoid going through the CPU (path.type is > PATH_PXN)*/ + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXN)) // We can use that GPU as relay to communicate with that NIC. // Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) diff --git a/src/graph/search.cc b/src/graph/search.cc index 67e600906..86199d78b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -960,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 96.0, 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1307,7 +1307,8 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; - if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { + int pxnType = ncclParamPxnC2c() ? 
PATH_P2C : PATH_PXB; + if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= pxnType) { *proxyRank = peerGpu->gpu.rank; if (dev) *dev = netDev; if (id) *id = netId; diff --git a/src/graph/topo.h b/src/graph/topo.h index 9b49c0222..9ef10ff2d 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -98,6 +98,8 @@ extern const char* topoLinkTypeStr[]; #define PATH_DIS 11 extern const char* topoPathTypeStr[]; +extern int64_t ncclParamPxnC2c(); + struct ncclTopoNode; struct ncclTopoLink { int type; diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h index 8d8ecf1ec..ae9051f28 100644 --- a/src/include/ibvcore.h +++ b/src/include/ibvcore.h @@ -9,6 +9,7 @@ #include #include #include +#include #if __GNUC__ >= 3 # define __attribute_const __attribute__((const)) @@ -39,7 +40,7 @@ union ibv_gid { #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; +static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; enum ibv_node_type { IBV_NODE_UNKNOWN = -1, @@ -208,7 +209,9 @@ struct ibv_port_attr { uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; - uint8_t reserved; + uint8_t flags; + uint16_t port_cap_flags2; + uint32_t active_speed_ex; }; enum ibv_event_type { @@ -993,37 +996,50 @@ enum verbs_context_mask { struct verbs_context { /* "grows up" - new fields go here */ - int (*_reserved_2) (void); - int (*destroy_flow) (struct ibv_flow *flow); - int (*_reserved_1) (void); - struct ibv_flow * (*create_flow) (struct ibv_qp *qp, - struct ibv_flow_attr *flow_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + size_t port_attr_len); + int (*_reserved[25]) (void); + struct verbs_ex_private *priv; + int (*query_device_ex)(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); + int (*ibv_destroy_flow) (struct ibv_flow *flow); + void (*ABI_placeholder2) (void); /* DO NOT COPY THIS GARBAGE */ + struct ibv_flow * (*ibv_create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + void (*ABI_placeholder1) (void); /* DO NOT COPY THIS GARBAGE */ struct ibv_qp * (*open_qp)(struct ibv_context *context, struct ibv_qp_open_attr *attr); struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); - struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, - struct ibv_srq_init_attr_ex *srq_init_attr_ex); - struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, - struct ibv_xrcd_init_attr *xrcd_init_attr); - int (*close_xrcd)(struct ibv_xrcd *xrcd); - uint64_t has_comp_mask; - size_t sz; /* Must be immediately before struct ibv_context */ - struct ibv_context context;/* Must be last field in the struct */ + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t _ABI_placeholder3; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context; /* Must be last field in the struct */ }; -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ 
-/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { - return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? - NULL : container_of(ctx, struct verbs_context, context); + if (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED) + return NULL; + + /* open code container_of to not pollute the global namespace */ + return (struct verbs_context *)(((uintptr_t)ctx) - + offsetof(struct verbs_context, + context)); } #define verbs_get_ctx_op(ctx, op) ({ \ - struct verbs_context *_vctx = verbs_get_ctx(ctx); \ - (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ - !_vctx->op) ? NULL : _vctx; })*/ + struct verbs_context *__vctx = verbs_get_ctx(ctx); \ + (!__vctx || (__vctx->sz < sizeof(*__vctx) - offsetof(struct verbs_context, op)) || \ + !__vctx->op) ? NULL : __vctx; }) #define verbs_set_ctx_op(_vctx, op, ptr) ({ \ struct verbs_context *vctx = _vctx; \ @@ -1055,4 +1071,20 @@ struct ibv_ece { uint32_t comp_mask; }; +/** + * ibv_query_port_ex - Get (extended) port properties + */ +static inline int ibv_query_port_ex(struct ibv_context *context, + uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + + if (vctx) { + return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); + } + + return -1; +} + #endif // NCCL_IBV_CORE_H_ diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h index 7336c34d9..300e436a0 100644 --- a/src/include/plugin/plugin.h +++ b/src/include/plugin/plugin.h @@ -9,10 +9,16 @@ #include "nccl.h" +enum ncclPluginType { + ncclPluginTypeNet, + ncclPluginTypeTuner, + ncclPluginTypeProfiler, +}; + void* ncclOpenNetPluginLib(const char* name); void* ncclOpenTunerPluginLib(const char* name); void* ncclOpenProfilerPluginLib(const char* name); -void* ncclGetNetPluginLib(void); -ncclResult_t ncclClosePluginLib(void* handle); +void* ncclGetNetPluginLib(enum ncclPluginType type); +ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type); #endif diff --git a/src/init.cc b/src/init.cc index 2a57c46c0..af784c02d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -2170,6 +2170,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2184,6 +2185,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; @@ -2207,6 +2210,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit NCCLCHECK(setCommAbortFlags(comm,1)); comm->destroyFlag = 1; @@ -2229,7 +2233,9 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - return ncclSuccess; + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; fail: goto exit; } 
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 23bf5e125..59f52e320 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -142,8 +142,14 @@ ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_devic IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); } -ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); +ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + // First try and query the extended port attributes (e.g. active_speed_ex) + if (ibv_query_port_ex(context, port_num, port_attr) != 0) { + // Fall back to the original attribute API call, but zero all members first + memset(port_attr, 0, sizeof(*port_attr)); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); + } + return ncclSuccess; } ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 278fb5c51..d066d2829 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -441,7 +441,8 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN || - errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) { + errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH || + errno == EINTR) { /* per accept's man page, for linux sockets, the following errors might be already pending errors * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ if (++sock->errorRetries == ncclParamRetryCnt()) { diff --git a/src/plugin/net.cc b/src/plugin/net.cc index 78944106a..aa80c12ab 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -67,7 +67,7 @@ static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); - NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); memset(pluginLib, 0, sizeof(netPluginLib_t)); } return ncclSuccess; @@ -105,8 +105,9 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { return ncclSuccess; fail: if (pluginLib->dlHandle) { - NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); } + pluginLib->dlHandle = nullptr; pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed; pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; goto exit; diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index 64c97be39..f80321c81 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -10,16 +10,12 @@ #include #include "debug.h" +#include "plugin.h" #define MAX_STR_LEN 255 -enum ncclPluginType { - ncclPluginTypeNet, - ncclPluginTypeTuner, - ncclPluginTypeProfiler, -}; - #define NUM_LIBS 3 +static char* libNames[NUM_LIBS]; static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; @@ -65,6 +61,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -79,6 +76,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -91,6 +89,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -120,22 +119,21 @@ void* ncclOpenProfilerPluginLib(const char* name) { return openPluginLib(ncclPluginTypeProfiler, name); } -void* ncclGetNetPluginLib(void) { - return libHandles[ncclPluginTypeNet]; +void* ncclGetNetPluginLib(enum ncclPluginType type) { + if (libNames[ncclPluginTypeNet]) { + // increment the reference counter of the net library + libNames[type] = strdup(libNames[ncclPluginTypeNet]); + libHandles[type] = dlopen(libNames[ncclPluginTypeNet], RTLD_NOW | RTLD_LOCAL); + } + return libHandles[type]; } -ncclResult_t ncclClosePluginLib(void* handle) { - bool found = false; - for (int l=0; lname); - NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); + 
NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc index 443bf78c4..24a59de2e 100644 --- a/src/plugin/tuner.cc +++ b/src/plugin/tuner.cc @@ -52,7 +52,7 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); if (nullptr == tunerPluginLib) { - tunerPluginLib = ncclGetNetPluginLib(); + tunerPluginLib = ncclGetNetPluginLib(ncclPluginTypeTuner); if (nullptr == tunerPluginLib) { goto fail; } @@ -78,6 +78,7 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; fail: + if (tunerPluginLib) NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; status = tunerPluginLoadFailed; goto exit; @@ -87,7 +88,7 @@ ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&tunerPluginLock); if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - NCCLCHECK(ncclClosePluginLib(tunerPluginLib)); + NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; tunerSymbol = nullptr; comm->tuner = nullptr; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 40897d93f..709e7ad40 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -652,12 +652,15 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; char dataDirectDevicePath[PATH_MAX]; int dataDirectSupported = 0; + int skipNetDevForDataDirect = 0; if (wrap_mlx5dv_is_supported(devices[d])) { ibProvider = IB_PROVIDER_MLX5; snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); - if(ncclParamIbDataDirect()) dataDirectSupported = 1; + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device:%s", devices[d]->name); + // Now check whether Data Direct has been disabled by the user + if(ncclParamIbDataDirect() == 1) { dataDirectSupported = 1; skipNetDevForDataDirect = 1; } + if(ncclParamIbDataDirect() == 2) { dataDirectSupported = 1; skipNetDevForDataDirect = 0; } } } int nPorts = 0; @@ -669,7 +672,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - for (int dataDirect = 0; dataDirect < 1 + dataDirectSupported; ++dataDirect) { + // dataDirect = 0 exposes the devices normally, dataDirect = 1 exposes the devices through direct NIC + for (int dataDirect = skipNetDevForDataDirect; dataDirect < 1 + dataDirectSupported; ++dataDirect) { struct ibv_port_attr portAttr; if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { WARN("NET/IB : Unable to query port_num %d", port_num); @@ -690,15 +694,18 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ncclIbDevs[ncclNIbDevs].portAttr = portAttr; ncclIbDevs[ncclNIbDevs].portNum = port_num; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - ncclIbDevs[ncclNIbDevs].speed = 
ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + if (portAttr.active_speed_ex) + // A non-zero active_speed_ex indicates XDR rate (0x100) or higher + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); + else + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; ncclIbDevs[ncclNIbDevs].pdRefs = 0; ncclIbDevs[ncclNIbDevs].pd = NULL; if (!dataDirect) { strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - } - else { + } else { snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); From bfedf2629eae7abbcb7b9bd4841723b21f725636 Mon Sep 17 00:00:00 2001 From: Stephen Sachs Date: Wed, 16 Jul 2025 17:56:12 +0200 Subject: [PATCH 14/21] Add issues templates and Github action to remove stale issues We add 3 different issue types issue/question/RFE and add some predefined questions to speed up the debugging process. We also add a custom action which will close all issues create mode than 6 months ago which have not been updated for more than a month. --- .github/ISSUE_TEMPLATE/ISSUE.yaml | 77 +++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/QUESTION.yaml | 15 +++++ .github/ISSUE_TEMPLATE/RFE.yaml | 22 +++++++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/workflows/close-old-issues.js | 76 ++++++++++++++++++++++++ .github/workflows/close_old_issues.yaml | 31 ++++++++++ 6 files changed, 222 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/ISSUE.yaml create mode 100644 .github/ISSUE_TEMPLATE/QUESTION.yaml create mode 100644 .github/ISSUE_TEMPLATE/RFE.yaml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/workflows/close-old-issues.js create mode 100644 .github/workflows/close_old_issues.yaml diff --git a/.github/ISSUE_TEMPLATE/ISSUE.yaml b/.github/ISSUE_TEMPLATE/ISSUE.yaml new file mode 100644 index 000000000..f760b305b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/ISSUE.yaml @@ -0,0 +1,77 @@ +name: NCCL issue or bug +description: Report an issue or failure when running NCCL code +title: "[Issue]: " +labels: ["triage"] + +body: + - type: markdown + attributes: + value: | + Thanks for reaching out! Before reporting a new issue, please feel free to search for the behavior in the existing issues. If you found an issue which is already closed or you are unsure, open a new issue and reference the old one from it. + You can also check out the [troubleshooting section](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html) in our user guide. + + --- + + To ensure we can assist you quickly and accurately, we often need the following information: + - type: dropdown + id: type + attributes: + label: How is this issue impacting you? + description: What best describes your issue? + options: + - Lower performance than expected + - Application crash + - Data corruption + - Application hang + validations: + required: true + + - type: textarea + id: log + attributes: + label: Share Your Debug Logs + description: | + + The logs and topo-files are a great tool to pin down issues. You can create them by setting these environment variables before the run. 
+ * `NCCL_DEBUG=INFO` and `NCCL_DEBUG_FILE=ncclDebug.%h.%p` to produce one file per rank + * `NCCL_TOPO_DUMP_FILE=ncclSystem.txt` + + - type: textarea + id: repro + attributes: + label: Steps to Reproduce the Issue + description: | + * **Minimal Steps**: Please provide a simple way to recreate the issue (see [Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports) for inspiration). + * **Environment Details**: Include software versions and relevant settings. + * **Intermittency**: Is this a sporadic issue? If so, how often does it occur? + * **Previous Success**: Did this work with an older NCCL version? + + The easier we can reproduce on our side the more likely we are to be able to solve it in a timely manner. + + - type: input + id: nccl_version + attributes: + label: NCCL Version + description: | + NCCL reports its version string in the debug logs. + You can also determine the version if you know which library was used by running `strings libnccl.so | grep 'NCCL version'`. + placeholder: "e.g. 2.27.1+cuda12.8" + validations: + required: true + + - type: textarea + id: platform + attributes: + label: Your platform details + description: | + * **GPU & Network**: Share your architecture and topology (e.g., from `nvidia-smi`, `nvidia-smi topo -m`, `ibstatus`). + * **Environment**: Bare-metal, containers, or cloud? + * **Scalability**: Does this issue occur with a specific number of ranks/nodes? + + - type: textarea + id: issue-description + attributes: + label: Error Message & Behavior + description: | + * **First Error**: What was the initial `NCCL WARN` message in your logs? + * **Expected vs. Actual**: Briefly describe the anticipated behavior versus what you're seeing. diff --git a/.github/ISSUE_TEMPLATE/QUESTION.yaml b/.github/ISSUE_TEMPLATE/QUESTION.yaml new file mode 100644 index 000000000..60e43489f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/QUESTION.yaml @@ -0,0 +1,15 @@ +name: NCCL question +description: Ask the NCCL team a question +title: "[Question]: " +labels: ["question"] + +body: + - type: markdown + attributes: + value: | + Thanks for reaching out! To solve your problem, feel free to check out the [user guide](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), in particular the troubleshooting section, and also the [release notes](https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html). + --- + - type: textarea + id: question + attributes: + label: Question diff --git a/.github/ISSUE_TEMPLATE/RFE.yaml b/.github/ISSUE_TEMPLATE/RFE.yaml new file mode 100644 index 000000000..7a305abfa --- /dev/null +++ b/.github/ISSUE_TEMPLATE/RFE.yaml @@ -0,0 +1,22 @@ +name: NCCL request for enhancement +description: Request for enhancement +title: "[RFE]: " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + + Thanks for your feedback! Before reporting a new RFE you could quickly check if this already exists in our [existing requests](https://github.com/NVIDIA/nccl/issues?q=sort%3Aupdated-desc%20is%3Aissue%20is%3Aopen%20label%3Aenhancement). + + --- + - type: textarea + id: rfe-description + attributes: + label: Please provide the below details to ensure we understand your needs + description: | + * What is the goal of this request? + * Who will benefit from this feature? + * Is this request for a specific GPU architecture or network infrastructure? + * How will this feature improve current workflows or processes? + * What is the priority level of this request? 
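As a side note for the "NCCL Version" field requested above, the version can also be queried programmatically. A minimal sketch (not part of this patch) using the public `ncclGetVersion` call:

```c
#include <stdio.h>
#include "nccl.h"

int main(void) {
  int version = 0;
  if (ncclGetVersion(&version) != ncclSuccess) return 1;
  // Since NCCL 2.9 the code is encoded as major*10000 + minor*100 + patch.
  printf("NCCL version code: %d (%d.%d.%d)\n", version,
         version / 10000, (version % 10000) / 100, version % 100);
  return 0;
}
```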
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js new file mode 100644 index 000000000..9605aa023 --- /dev/null +++ b/.github/workflows/close-old-issues.js @@ -0,0 +1,76 @@ +const { Octokit } = require("@octokit/rest"); + +const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN }); + +const owner = process.env.REPO_OWNER; +const repo = process.env.REPO_NAME.split('/').pop(); // Handles owner/repo format + +const now = new Date(); +const sixMonthsAgo = new Date(now); +sixMonthsAgo.setMonth(now.getMonth() - 6); +const oneMonthAgo = new Date(now); +oneMonthAgo.setMonth(now.getMonth() - 1); + +async function closeOldIssues() { + let page = 1; + let closedCount = 0; + + // write a multiline comment into a variable: + let body = `### Issue Cleanup: Helping Us Focus on Current Challenges + +We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue hasn't seen updates in over 6 months, we'll be closing it for now. + +*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., \"Still relevant on [version=X]\"). +Thanks for your understanding and for contributing to NCCL.`; + + while (true) { + const { data: issues } = await octokit.issues.listForRepo({ + owner, + repo, + state: "open", + per_page: 100, + page, + }); + + if (issues.length === 0) break; + + for (const issue of issues) { + // Ignore PRs + if (issue.pull_request) continue; + + const createdAt = new Date(issue.created_at); + const updatedAt = new Date(issue.updated_at); + + if (createdAt < sixMonthsAgo && updatedAt < oneMonthAgo) { + + // Add a comment before closing + await octokit.issues.createComment({ + owner, + repo, + issue_number: issue.number, + body: body, + }); + + await octokit.issues.update({ + owner, + repo, + issue_number: issue.number, + state: "closed", + state_reason: "not_planned", + }); + closedCount++; + console.log(`Closed issue #${issue.number}`); + + // Break out if we have closed 100 issues + if (closedCount >= 100) { + console.log("Closed 100 issues, stopping."); + return; + } + } + } + page++; + } + console.log(`Total closed: ${closedCount}`); +} + +closeOldIssues().catch(console.error); diff --git a/.github/workflows/close_old_issues.yaml b/.github/workflows/close_old_issues.yaml new file mode 100644 index 000000000..15d81cb54 --- /dev/null +++ b/.github/workflows/close_old_issues.yaml @@ -0,0 +1,31 @@ +name: Close Old Issues + +on: + schedule: + - cron: '30 2 * * *' # Runs daily at 02:30 UTC + workflow_dispatch: + +permissions: + issues: write + +jobs: + close-old-issues: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Install dependencies + run: npm install @octokit/rest@22.0.0 + + - name: Run close-old-issues script + run: node .github/workflows/close-old-issues.js + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_OWNER: ${{ github.repository_owner }} + REPO_NAME: ${{ github.event.repository.name || github.repository }} From 0d1ece2b43ba1d85c76746ce63505f6db6b6b2f4 Mon 
Sep 17 00:00:00 2001 From: Stephen Sachs Date: Thu, 17 Jul 2025 21:50:05 +0200 Subject: [PATCH 15/21] Exclude ongoing issues from auto-closing logic - Added a check to skip issues labeled "ongoing" in the close-old-issues script - Adjusted the condition to compare both creation and update dates against six months ago --- .github/workflows/close-old-issues.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js index 9605aa023..57e110339 100644 --- a/.github/workflows/close-old-issues.js +++ b/.github/workflows/close-old-issues.js @@ -38,10 +38,13 @@ Thanks for your understanding and for contributing to NCCL.`; // Ignore PRs if (issue.pull_request) continue; + // Ignore issues with label "ongoing" + if (issue.labels.some(label => label.name === "ongoing")) continue; + const createdAt = new Date(issue.created_at); const updatedAt = new Date(issue.updated_at); - if (createdAt < sixMonthsAgo && updatedAt < oneMonthAgo) { + if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) { // Add a comment before closing await octokit.issues.createComment({ From 593de54e52679b51428571c13271e2ea9f91b1b1 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Thu, 24 Jul 2025 10:39:53 -0700 Subject: [PATCH 16/21] NCCL 2.27.7-1 Prevent initialization failures in certain configurations when attempting to load fp8-specific symmetric multicast kernels on GPUs older than Blackwell. --- ext-tuner/README.md | 182 +++++++++++++++++++++++++++++++++ ext-tuner/basic/README.md | 197 ++++++++++++++++++++++++++++++++++++ ext-tuner/example/README.md | 3 +- makefiles/version.mk | 2 +- src/enqueue.cc | 5 +- 5 files changed, 382 insertions(+), 7 deletions(-) create mode 100644 ext-tuner/README.md create mode 100644 ext-tuner/basic/README.md diff --git a/ext-tuner/README.md b/ext-tuner/README.md new file mode 100644 index 000000000..67a743a12 --- /dev/null +++ b/ext-tuner/README.md @@ -0,0 +1,182 @@ +# NCCL Tuner Plugin Development + +This directory contains resources and examples for developing NCCL tuner plugins. Tuner plugins allow you to customize NCCL's algorithm and protocol selection behavior to optimize performance for specific workloads and hardware configurations. + +## Overview + +NCCL tuner plugins provide a way to influence NCCL's automatic algorithm and protocol selection by modifying the cost tables that NCCL uses to make decisions. This allows you to: + +- Override default algorithm/protocol combinations for specific collective operations +- Customize tuning based on message size, topology, and other parameters +- Implement sophisticated tuning strategies without recompiling NCCL +- Optimize performance for specific hardware configurations or workloads + +## Tuner Plugin Interface + +NCCL tuner plugins must implement the `ncclTuner_t` interface defined in `nccl_tuner.h` within `nccl/src/include/plugin`. These definitions have been forked to `tuner.h` in each example plugin, and it is expected that any plugin implementor forks the internal NCCL definitions as well. 
The current interface includes: + +```c +// Initialize the tuner plugin +ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + +// Get and modify collective operation cost information +ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + +// Clean up plugin resources +ncclResult_t (*destroy)(void* context); +``` + +## Development Guidelines + +### 1. Plugin Structure + +A typical tuner plugin should: +- Include the necessary forked NCCL headers (`tuner.h`) +- Implement all required interface functions +- Export the plugin structure with appropriate version +- Handle all input parameters gracefully + +### 2. Cost Table Modification + +The `getCollInfo` function receives a cost table that maps algorithm/protocol combinations to performance costs. Lower costs indicate preferred combinations. You can: + +- Set costs to `0.0` to make combinations highly preferred +- Set costs to `NCCL_ALGO_PROTO_IGNORE` to disable combinations +- Use relative costs to create preferences between options + +### 3. Channel Management + +The `nChannels` parameter allows you to: +- Set a specific number of channels to use +- Return the original value to preserve NCCL's default behavior +- Implement dynamic channel selection based on message size or topology + +### 4. Error Handling + +Always return appropriate `ncclResult_t` values: +- `ncclSuccess` for successful or ignored operations +- `ncclInternalError` for plugin-specific errors. Returning an error is only advisable on plugin initialization and destruction, as the penalty users can pay for the overhead of a failed plugin call can be immense. +- Other NCCL error codes as appropriate + +## Getting Started + +### Option 1: Start with the Example Plugin + +If you're new to tuner plugin development, start with the `example/` directory: + +```bash +cd example/ +make +``` + +This provides a CSV-based configuration system that you can customize or use as a template. + +## Building and Testing + +### Build Requirements + +- GCC or compatible C compiler +- NCCL headers (included in `nccl/` subdirectories) +- Make + +## Option 2: Use the Basic Plugin + +For more customized tuning needs, you might want to start with a clean baseline. In that case, base off the basic plugin in the `basic/` directory: + +```bash +cd basic/ +make +``` + +### Build Process + +Each plugin directory contains a Makefile: + +```bash +cd basic/ # or example/ +make +``` + +This generates a shared library (`.so` file) that can be loaded by NCCL. + +### Loading the Plugin + +Set the `LD_LIBRARY_PATH` to include your plugin directory: + +```bash +export LD_LIBRARY_PATH=/path/to/your/plugin:$LD_LIBRARY_PATH +``` + +Set `NCCL_TUNER_PLUGIN` to either the plugin name, or the absolute path to the plugin file. Any of the below can work: + +```bash +export NCCL_TUNER_PLUGIN=example +export NCCL_TUNER_PLUGIN=libnccl-tuner-example.so +export NCCL_TUNER_PLUGIN=/path/to/your/plugin/libnccl-tuner-example.so +``` + +NCCL will automatically discover and load the plugin based on the exported symbol names. + +## Advanced Topics + +### Plugin Versioning + +NCCL supports multiple plugin interface versions. 
Make sure your plugin exports the correct version: + +```c +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = "YourPluginName", + .init = yourInitFunction, + .getCollInfo = yourGetCollInfoFunction, + .destroy = yourDestroyFunction +}; +``` + +### Multi-GPU and Multi-Node Considerations + +Your plugin receives topology information (`nRanks`, `nNodes`) during initialization. Use this to: +- Implement topology-aware tuning strategies +- Handle single-node vs. multi-node optimizations differently +- Scale channel counts based on available hardware + +### Performance Optimization + +- Keep plugin logic lightweight to avoid impacting NCCL performance +- Cache expensive computations when possible +- Use the logging system for debugging but avoid excessive output in production + +## Debugging and Logging + +Use NCCL's debug logging system: + +```bash +export NCCL_DEBUG=INFO # General information +export NCCL_DEBUG_SUBSYS=TUNING +``` + +Within your plugin, use the provided `ncclDebugLogger_t` function for consistent logging. + +## Best Practices + +1. **Test thoroughly**: Verify your plugin works with various message sizes and topologies +2. **Handle edge cases**: Ensure your plugin behaves correctly with unusual input parameters +3. **Document your approach**: Clearly document your tuning strategy and configuration options +4. **Version your plugin**: Use meaningful version numbers and maintain backward compatibility +5. **Performance validation**: Measure the impact of your tuning decisions on real workloads + +## Contributing + +When developing new tuner plugins: +- Follow the existing code style and structure +- Include comprehensive documentation +- Add example configurations and test cases +- Consider contributing useful plugins back to the community + +## Resources + +- [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/) +- Example plugin implementations in this directory + +For questions and support, refer to the NCCL community resources and documentation. \ No newline at end of file diff --git a/ext-tuner/basic/README.md b/ext-tuner/basic/README.md new file mode 100644 index 000000000..acc6d5545 --- /dev/null +++ b/ext-tuner/basic/README.md @@ -0,0 +1,197 @@ +# Basic NCCL Tuner Plugin + +This directory contains a minimal placeholder implementation of an NCCL tuner plugin. It serves as a starting point for developing custom tuner plugins by providing the essential function stubs and interface structure required by NCCL. 
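Tying together the topology and context suggestions from the parent README, a minimal, hypothetical sketch of that wiring could look as follows; only the function signatures come from `tuner.h`, while the struct and function names are illustrative:

```c
#include <stdlib.h>
#include "tuner.h"

// Hypothetical per-communicator state recorded at init time.
struct tunerCtx {
  size_t nRanks;
  size_t nNodes;
  ncclDebugLogger_t log;
};

static ncclResult_t exampleInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
  struct tunerCtx* ctx = calloc(1, sizeof(*ctx));
  if (ctx == NULL) return ncclInternalError;
  ctx->nRanks = nRanks; ctx->nNodes = nNodes; ctx->log = logFunction;
  *context = ctx;
  return ncclSuccess;
}

static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable, int numAlgo, int numProto,
                                       int regBuff, int* nChannels) {
  struct tunerCtx* ctx = (struct tunerCtx*)context;
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  // Example topology-aware rule: on a single node, prefer RING+SIMPLE and keep
  // NCCL's default channel count (nChannels left untouched).
  if (ctx->nNodes == 1 && NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
  }
  return ncclSuccess;
}

static ncclResult_t exampleDestroy(void* context) { free(context); return ncclSuccess; }
```

The same context could later hold file-based configuration or cached decisions, as discussed in the sections below.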
+ +## Purpose + +This basic plugin is designed to: +- Provide a minimal working example of the NCCL tuner plugin interface +- Serve as a template for developing custom tuner plugins +- Demonstrate the required function signatures and structure +- Implement placeholder functionality that can be extended + + +## Implementation Details + +The plugin implements the following functions: + +### `pluginInit` +```c +ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) +``` +- **Purpose**: Initialize the plugin with communicator information +- **Current Implementation**: Simple placeholder that returns success +- **Parameters**: + - `nRanks`: Total number of ranks in the communicator + - `nNodes`: Total number of nodes in the communicator + - `logFunction`: NCCL debug logging function + - `context`: Plugin context pointer (output) + +### `pluginGetCollInfo` +```c +ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) +``` +- **Purpose**: Modify cost tables for collective operations +- **Current Implementation**: + - Sets RING+SIMPLE algorithm to cost 0.0 (highest preference) + - Sets channel count to 1 +- **Parameters**: + - `context`: Plugin context from init + - `collType`: Type of collective operation + - `nBytes`: Message size in bytes + - `numPipeOps`: Number of pipeline operations + - `collCostTable`: Cost table to modify + - `numAlgo`: Number of algorithms + - `numProto`: Number of protocols + - `regBuff`: Whether buffer can be registered + - `nChannels`: Number of channels to use (output) + +### `pluginDestroy` +```c +ncclResult_t pluginDestroy(void* context) +``` +- **Purpose**: Clean up plugin resources +- **Current Implementation**: Simple placeholder that returns success + +## Cost Table Structure + +The plugin demonstrates how to modify NCCL's cost tables: + +```c +float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; +``` + +The cost table is a 2D array where: +- First dimension: Algorithm index (e.g., `NCCL_ALGO_RING`) +- Second dimension: Protocol index (e.g., `NCCL_PROTO_SIMPLE`) +- Values: Cost for that algorithm/protocol combination + +### Cost Values +- **0.0**: Highest preference (lowest cost) +- **Positive values**: Relative costs (lower is better) +- **`NCCL_ALGO_PROTO_IGNORE`**: Disable this combination + +## Building + +```bash +make +``` + +This creates `libnccl-tuner-basic.so` which can be loaded by NCCL. + +## Usage + +### Loading the Plugin + +```bash +export LD_LIBRARY_PATH=/path/to/basic:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +```bash +export NCCL_TUNER_PLUGIN=basic +export NCCL_TUNER_PLUGIN=libnccl-tuner-basic.so +export NCCL_TUNER_PLUGIN=/path/to/your/plugin/libnccl-tuner-basic.so +``` + +### Verifying Plugin Loading + +Enable NCCL debug output to see if the plugin is loaded: + +```bash +export NCCL_DEBUG=INFO +``` + +You should see messages indicating the tuner plugin is being used. + +## Extending the Plugin + +This basic plugin provides a foundation that you can extend: + +### 1. 
Add Configuration Logic + +Modify `pluginGetCollInfo` to implement your tuning strategy: + +```c +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Your custom tuning logic here + if (nBytes < 1024) { + // Small message optimization + table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = 0.0; + } else { + // Large message optimization + table[NCCL_ALGO_RING][NCCL_PROTO_LL128] = 0.0; + } + + // Dynamic channel selection + *nChannels = (nBytes > 1024*1024) ? 4 : 1; + + return ncclSuccess; +} +``` + +### 2. Add Context Management + +Use the context pointer to store plugin state: + +```c +struct pluginContext { + int initialized; + size_t nRanks; + size_t nNodes; + // Add your plugin-specific data here +}; +``` + +### 3. Add File-Based Configuration + +Read configuration from files, environment variables, or other sources. + +### 4. Add Topology Awareness + +Use the `nRanks` and `nNodes` parameters to implement topology-specific tuning. + +## File Structure + +``` +basic/ +├── README.md # This file +├── plugin.c # Plugin implementation +├── Makefile # Build configuration +└── nccl/ # NCCL header files + └── tuner.h # Tuner plugin interface definitions +``` + +## Next Steps + +1. **Understand the Interface**: Study the function signatures and parameters +2. **Implement Your Logic**: Add your tuning strategy to `pluginGetCollInfo` +3. **Test Thoroughly**: Verify your plugin works with different message sizes and topologies +4. **Add Error Handling**: Implement proper error checking and resource management +5. **Document Your Changes**: Update this README with your specific implementation details + +## Comparison with Example Plugin + +- **Basic Plugin**: Minimal implementation, good for learning and simple use cases +- **Example Plugin**: Full-featured CSV-based configuration system, good for production use + +Choose the basic plugin if you want to: +- Learn the tuner plugin interface +- Implement simple, hardcoded tuning strategies +- Build a custom plugin from scratch + +Choose the example plugin if you want: +- File-based configuration +- Complex tuning strategies +- Production-ready features + +## Resources + +- [Parent Directory README](../README.md) - General tuner plugin development guide +- [Example Plugin](../example/README.md) - Fully featured implementation + +This basic plugin provides the foundation you need to start developing custom NCCL tuner plugins. Extend it with your specific tuning logic and requirements. diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md index 7f472ae7a..10a99b5f2 100644 --- a/ext-tuner/example/README.md +++ b/ext-tuner/example/README.md @@ -104,7 +104,6 @@ Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file ```bash export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf -export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH mpirun -np 4 your_nccl_application ``` @@ -158,7 +157,7 @@ When channels is set to `-1`, NCCL's default channel selection logic is preserve 1. **Config file not found**: Check the file path and permissions 2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters -3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +3. 
**Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory and that `NCCL_TUNER_PLUGIN` either specifies the plugin name, or an absolute path to the plugin shared library. 4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` 5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards 6. **CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/makefiles/version.mk b/makefiles/version.mk index 0f482d31a..3b182d61b 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 6 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/enqueue.cc b/src/enqueue.cc index f5b43724c..225a4cffc 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -38,12 +38,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma if (fn == nullptr) continue; cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); - if (errcode == cudaErrorNoKernelImageForDevice) continue; - CUDACHECKGOTO(errcode, result, ignore0); - + if (errcode != cudaSuccess) continue; // Silently ignore failures if (maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; - ignore0:; } if (carveout) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, From f1308997d0420148b1be1c24d63f19d902ae589b Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Tue, 2 Sep 2025 13:21:14 -0700 Subject: [PATCH 17/21] NCCL 2.28.3-1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device API (Experimental) * Introduces device-side APIs to integrate NCCL communication directly into application kernels. * Supports LSA (Load/Store Access) for CUDA P2P communication over NVLink and some PCIe platforms. * Supports Multimem for hardware multicast using NVLink SHARP. * Adds initial framework for GIN (GPU-Initiated Networking), currently under development. * Introduces device communicators created using ncclDevCommCreate. * Enables device-side communication operations with synchronization (ncclLsaBarrierSession) and memory accessors (ncclGetLsaPointer, ncclGetLsaMultimemPointer). * Experimental APIs - signatures and functionality may evolve in future releases. * No ABI compatibility is guaranteed — applications must be recompiled with each new NCCL release. Symmetric memory improvements * Support for aggregating symmetric operations using ncclGroupStart/End APIs. * Reimplement symmetric kernels using device API. New Host APIs * Introduce new host collective APIs: ncclAlltoAll, ncclScatter, ncclGather. CE (Copy Engine) Collectives * Reduce SM utilization for alltoall, scatter, gather, and allgather within a single (MN)NVL domain. * Free up SM capacity for the application to do computation at the same time. * To enable the feature for ncclAllGather, ncclAlltoAll, ncclGather, ncclScatter, register buffers into symmetric windows and use the NCCL_CTA_POLICY_ZERO flag in the communicator config_t. NCCL Inspector Plugin * Introduces an Inspector plugin for always-on performance monitoring. * Produces structured JSON output with metadata, execution time, bandwidth, and optional event traces for each NCCL operation. * Enables integration with analysis tools such as Performance Exporter to visualize NCCL performance bottlenecks. * Lightweight to enable via environment variables NCCL_PROFILER_PLUGIN and NCCL_INSPECTOR_ENABLE. 
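As an illustration of the CE collectives item above (a sketch only, not code from this release: the ncclConfig_t field name CTAPolicy and the NCCL_WIN_COLL_SYMMETRIC window flag are assumptions here), enabling the zero-CTA path for ncclAllGather could look roughly like:

```c
#include "nccl.h"

// Sketch: create a communicator requesting the zero-CTA policy and register the
// send/recv buffers into symmetric windows so the copy-engine path can be used.
ncclResult_t setupCeAllGather(ncclUniqueId id, int nranks, int rank,
                              void* sendbuf, void* recvbuf, size_t sendBytes,
                              ncclComm_t* comm, ncclWindow_t* sendWin, ncclWindow_t* recvWin) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.CTAPolicy = NCCL_CTA_POLICY_ZERO;  // assumed field name for the flag named in these notes
  ncclResult_t res = ncclCommInitRankConfig(comm, nranks, id, rank, &config);
  if (res != ncclSuccess) return res;
  res = ncclCommWindowRegister(*comm, sendbuf, sendBytes, sendWin, NCCL_WIN_COLL_SYMMETRIC);
  if (res != ncclSuccess) return res;
  res = ncclCommWindowRegister(*comm, recvbuf, sendBytes * nranks, recvWin, NCCL_WIN_COLL_SYMMETRIC);
  return res;
}
```

With this setup, subsequent ncclAllGather calls on the registered buffers can be served by copy engines rather than SMs, per the description above.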
CMake support (Experiemental) * Adds a CMake build system as an alternative to existing Makefiles. * Known issues: pkg.build and Device API currently do not work with CMake. * The known issues will be addressed in a future release. Decreased max CTA count from 32 to 16 on Blackwell * SM overhead is decreased by 50% with this improvement. * This may cause some perf drop on Blackwell because of the reduced SM usage. * If the extra SM capacity is not desired, two options are available to restore to previous behavior: 1) Setting NCCL_MIN_CTAS=32 NCCL_MAX_CTAS=32 environment variables; 2) setting communicator config to over-write max CTA count to 32. * Based on community feedback, future versions may consider different trade-offs between performance and SM overhead. Plugins * Network * App-aware Network plugin. NCCL passes information about communication operations to be executed on the network end point. This allows for better tuning of network end points and their use in the plugins. * Improve handling of physical and virtual network devices and load/unload. * Network plugin version 11 - add explicit context and communication ID support for per communicator init/finalize. * Add Multi-Request Net API. Using this will help NCCL to anticipate multiple send/recv requests and optimize for it. See maxMultiRequestSize field in ncclNetProperties_v11_t. * Profiler * Add support for API events (group, collective, and p2p) and for tracking kernel launches in the profiler plugin. * Add Inspector Profiler Plugin (see section above). * Add a hook to Google’s CoMMA profiler on github. * Tuner * Expose NCCL tuning constants at tuner initialization via ncclTunerConstants_v5_t. * Add NVL Domain Information API. * Support multiple plugin types from a single shared object. New Parameterization and ncclConfig changes: * Add new option NCCL_MNNVL_CLIQUE_ID=-2 which will use rack serial number to partition the MNNVL clique. This will limit NVLink domains to GPUs within a single rack. * Add NCCL_NETDEVS_POLICY to control how NET devices are assigned to GPUs. The default (AUTO) is the policy used in previous versions. * Add NCCL_SINGLE_PROC_MEM_REG_ENABLE control variable to enable NVLS UB registration in the “one process, multiple ranks” case as opt in. * Move nChannelsPerNetPeer into ncclConfig. NCCL_NCHANNELS_PER_NET_PEER can override the value in ncclConfig. * Enable PxN over C2C by default * PxN over C2C will improve performance for Grace-Blackwell platforms by allowing NCCL to leverage the NIC attached to a peer GPU over NVLINK, C2C, and PCIe. * This behavior can be overridden by setting NCCL_PXN_C2C=0. Other Improvements: * Allow FP8 support for non-reductive operations on pre sm90 devices. (See https://github.com/pytorch/pytorch/pull/151594#discussion_r2135777776) * Fix NVLS+CollNet and temporarily disables COLLNET_CHAIN for >8 GPUs. * Only consider running interfaces for socket traffic. NCCL will not attempt to use interfaces that do not have the IFF_RUNNING bit. (https://github.com/NVIDIA/nccl/issues/1798) * Modernize mutex management. Convert to std::mutex and std::lock_guard. * Remove sm35 and sm50 GENCODE targets which have long been deprecated and were causing issues with the latest NCCL release builds. * Improved NVLS/NVLSTree tuning prediction to improve algorithm and protocol selection. * NVLSTree Tuning Fixes. Update tuning data for H100, GB200-NV72. * Respond better to RoCE link flaps. Instead of reporting an “unknown event” it will now report “GID table changed”. 
* Move libvirt bridge interface to the end of possible interfaces so that they are considered last. These interfaces are usually virtual bridges to relay traffic to containers running on the host and cannot be used for traffic to a remote node and are therefore unsuitable. --- ext-net/README.md | 25 +- ext-net/example/CMakeLists.txt | 19 + ext-net/example/nccl/net.h | 10 +- ext-net/example/nccl/net_device.h | 5 +- ext-net/example/nccl/net_v10.h | 3 +- ext-net/example/nccl/net_v11.h | 120 ++ ext-net/example/nccl/net_v9.h | 3 +- ext-net/example/plugin.c | 101 +- ext-profiler/README.md | 119 +- ext-profiler/example/CMakeLists.txt | 34 + ext-profiler/example/Makefile | 16 +- ext-profiler/example/README.md | 180 +- ext-profiler/example/event.c | 30 - ext-profiler/example/event.h | 155 +- ext-profiler/example/nccl/profiler.h | 33 +- ext-profiler/example/nccl/profiler_v5.h | 152 ++ ext-profiler/example/{plugin.c => plugin.cc} | 264 ++- ext-profiler/example/plugin.h | 5 +- .../example/{print_event.c => print_event.cc} | 99 +- ext-profiler/example/queue.h | 50 + ext-profiler/google-CoMMA/Makefile | 22 + ext-profiler/inspector/Makefile | 62 + ext-profiler/inspector/README.md | 216 +++ .../inspector/exporter/example/README.md | 151 ++ .../exporter/example/perf_summary_exporter.py | 548 ++++++ .../exporter/example/requirements.txt | 6 + ext-profiler/inspector/inspector.cc | 1530 +++++++++++++++++ ext-profiler/inspector/inspector.h | 198 +++ ext-profiler/inspector/inspector_plugin.cc | 493 ++++++ ext-profiler/inspector/json.cc | 496 ++++++ ext-profiler/inspector/json.h | 83 + ext-profiler/inspector/nccl/common.h | 73 + ext-profiler/inspector/nccl/profiler.h | 85 + ext-profiler/inspector/nccl/profiler_net.h | 19 + ext-profiler/inspector/nccl/profiler_v1.h | 112 ++ ext-profiler/inspector/nccl/profiler_v2.h | 108 ++ ext-profiler/inspector/nccl/profiler_v3.h | 116 ++ ext-profiler/inspector/nccl/profiler_v4.h | 127 ++ ext-profiler/inspector/nccl/profiler_v5.h | 151 ++ ext-profiler/inspector/nccl/types.h | 21 + ext-profiler/inspector/version.h | 12 + ext-tuner/README.md | 2 +- ext-tuner/example/.gitignore | 49 + ext-tuner/example/CMakeLists.txt | 26 + ext-tuner/example/nccl/tuner.h | 51 +- ext-tuner/example/plugin.c | 36 +- ext-tuner/example/test/test_plugin.c | 178 +- makefiles/common.mk | 7 +- makefiles/version.mk | 4 +- pkg/Makefile | 2 +- pkg/debian/libnccl-dev.install.in | 2 +- pkg/redhat/nccl.spec.in | 4 +- pkg/srctxz/Makefile | 2 +- pkg/srctxz/create_srctxz.sh.in | 28 +- src/CMakeLists.txt | 180 ++ src/Makefile | 20 +- src/allocator.cc | 396 ++++- src/bootstrap.cc | 25 +- src/ce_coll.cc | 615 +++++++ src/collectives.cc | 42 + src/debug.cc | 46 +- src/dev_runtime.cc | 995 +++++++++++ src/device/CMakeLists.txt | 60 + src/device/Makefile | 8 +- src/device/common.h | 14 +- src/device/generate.py | 18 +- src/device/symmetric/all_gather.cuh | 260 +-- src/device/symmetric/all_reduce.cuh | 353 ++-- src/device/symmetric/generate.py | 62 +- src/device/symmetric/kernel.cuh | 24 +- src/device/symmetric/primitives.cuh | 453 +---- src/device/symmetric/reduce_scatter.cuh | 265 +-- src/enqueue.cc | 590 ++++--- src/graph/CMakeLists.txt | 14 + src/graph/connect.cc | 40 +- src/graph/paths.cc | 15 +- src/graph/topo.cc | 375 ++-- src/graph/topo.h | 24 +- src/graph/tuning.cc | 175 +- src/graph/xml.cc | 44 +- src/graph/xml.h | 7 + src/group.cc | 213 ++- src/include/allocator.h | 52 +- src/include/bitops.h | 29 +- src/include/ce_coll.h | 76 + src/include/channel.h | 7 +- src/include/coll_net.h | 3 +- 
src/include/collectives.h | 8 +- src/include/comm.h | 55 +- src/include/core.h | 1 + src/include/cpuset.h | 95 +- src/include/cudawrap.h | 6 + src/include/debug.h | 26 +- src/include/dev_runtime.h | 92 + src/include/device.h | 24 +- src/include/graph.h | 2 + src/include/group.h | 1 + src/include/nccl_common.h | 26 +- src/include/nccl_device.h | 15 + src/include/nccl_device/README.md | 32 + src/include/nccl_device/comm.h | 10 + src/include/nccl_device/coop.h | 152 ++ src/include/nccl_device/core.h | 150 ++ src/include/nccl_device/impl/comm__funcs.h | 10 + src/include/nccl_device/impl/comm__types.h | 40 + src/include/nccl_device/impl/core__funcs.h | 210 +++ src/include/nccl_device/impl/core__types.h | 26 + src/include/nccl_device/impl/ll_a2a__funcs.h | 229 +++ src/include/nccl_device/impl/ll_a2a__types.h | 37 + .../nccl_device/impl/mem_barrier__funcs.h | 126 ++ .../nccl_device/impl/mem_barrier__types.h | 46 + src/include/nccl_device/impl/ptr__funcs.h | 157 ++ src/include/nccl_device/impl/ptr__types.h | 11 + src/include/nccl_device/ll_a2a.h | 53 + src/include/nccl_device/mem_barrier.h | 35 + src/include/nccl_device/ptr.h | 61 + src/include/nccl_device/utility.h | 352 ++++ src/include/net.h | 6 + src/include/net_device.h | 5 +- src/include/nvmlwrap.h | 21 + src/include/nvtx.h | 108 +- src/include/nvtx3/nvToolsExtCounters.h | 2 +- .../nvtx3/nvToolsExtSemanticsCounters.h | 2 +- src/include/nvtx3/nvToolsExtSemanticsScope.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtHelperMacros.h | 2 +- src/include/nvtx3/nvtxDetail/nvtxExtImpl.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h | 2 +- .../nvtxDetail/nvtxExtPayloadHelperInternal.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h | 2 +- src/include/nvtx3/nvtxDetail/nvtxExtTypes.h | 2 +- src/include/nvtx_payload_schemas.h | 23 + src/include/plugin/nccl_net.h | 23 +- src/include/plugin/nccl_profiler.h | 32 +- src/include/plugin/nccl_tuner.h | 41 +- src/include/plugin/net/net_v10.h | 4 +- src/include/plugin/net/net_v11.h | 188 ++ src/include/plugin/net/net_v9.h | 4 +- src/include/plugin/plugin.h | 2 + src/include/plugin/profiler/profiler_v5.h | 151 ++ src/include/plugin/tuner/tuner_v5.h | 87 + src/include/profiler.h | 36 + src/include/proxy.h | 11 + src/include/register.h | 24 +- src/include/register_inline.h | 17 +- src/include/scheduler.h | 17 + src/include/shm.h | 6 + src/include/shmutils.h | 4 +- src/include/sym_kernels.h | 112 ++ src/include/symmetric.h | 90 - src/include/transport.h | 12 +- src/include/utils.h | 2 + src/init.cc | 201 ++- src/init_nvtx.cc | 13 + src/misc/CMakeLists.txt | 20 + src/misc/cudawrap.cc | 17 +- src/misc/gdrwrap.cc | 5 +- src/misc/ibvwrap.cc | 5 +- src/misc/mlx5dvwrap.cc | 30 +- src/misc/nvmlwrap.cc | 17 + src/misc/param.cc | 10 +- src/misc/shmutils.cc | 45 +- src/misc/socket.cc | 8 +- src/misc/strongstream.cc | 9 +- src/misc/utils.cc | 30 +- src/mnnvl.cc | 6 +- src/nccl.h.in | 59 +- src/nccl_device/CMakeLists.txt | 9 + src/nccl_device/core.cc | 57 + src/nccl_device/ll_a2a.cc | 26 + src/nccl_device/mem_barrier.cc | 21 + src/plugin/CMakeLists.txt | 18 + src/plugin/net.cc | 125 +- src/plugin/net/CMakeLists.txt | 12 + src/plugin/net/net_v10.cc | 187 +- src/plugin/net/net_v11.cc | 31 + src/plugin/net/net_v6.cc | 68 +- src/plugin/net/net_v7.cc | 67 +- src/plugin/net/net_v8.cc | 67 +- src/plugin/net/net_v9.cc | 117 +- src/plugin/plugin_open.cc | 66 +- src/plugin/profiler.cc | 297 +++- src/plugin/profiler/CMakeLists.txt | 11 + src/plugin/profiler/profiler_v1.cc | 16 +- src/plugin/profiler/profiler_v2.cc | 16 +- 
src/plugin/profiler/profiler_v3.cc | 16 +- src/plugin/profiler/profiler_v4.cc | 104 +- src/plugin/profiler/profiler_v5.cc | 21 + src/plugin/tuner.cc | 30 +- src/plugin/tuner/CMakeLists.txt | 10 + src/plugin/tuner/tuner_v2.cc | 14 +- src/plugin/tuner/tuner_v3.cc | 12 +- src/plugin/tuner/tuner_v4.cc | 22 +- src/plugin/tuner/tuner_v5.cc | 21 + src/proxy.cc | 68 +- src/ras/CMakeLists.txt | 11 + src/ras/ras.cc | 8 +- src/register/CMakeLists.txt | 9 + src/register/coll_reg.cc | 9 +- src/register/register.cc | 113 -- src/register/sendrecv_reg.cc | 6 + src/scheduler/CMakeLists.txt | 7 + src/scheduler/symmetric_sched.cc | 235 +++ src/{symmetric.cc => sym_kernels.cc} | 179 +- src/transport/CMakeLists.txt | 15 + src/transport/coll_net.cc | 62 +- src/transport/generic.cc | 6 + src/transport/net.cc | 149 +- src/transport/net_ib.cc | 281 +-- src/transport/net_socket.cc | 70 +- src/transport/nvls.cc | 136 +- src/transport/p2p.cc | 90 +- src/transport/profiler.cc | 2 +- 212 files changed, 15532 insertions(+), 2935 deletions(-) create mode 100644 ext-net/example/CMakeLists.txt create mode 100644 ext-net/example/nccl/net_v11.h create mode 100644 ext-profiler/example/CMakeLists.txt delete mode 100644 ext-profiler/example/event.c create mode 100644 ext-profiler/example/nccl/profiler_v5.h rename ext-profiler/example/{plugin.c => plugin.cc} (68%) rename ext-profiler/example/{print_event.c => print_event.cc} (76%) create mode 100644 ext-profiler/example/queue.h create mode 100644 ext-profiler/google-CoMMA/Makefile create mode 100644 ext-profiler/inspector/Makefile create mode 100644 ext-profiler/inspector/README.md create mode 100644 ext-profiler/inspector/exporter/example/README.md create mode 100644 ext-profiler/inspector/exporter/example/perf_summary_exporter.py create mode 100644 ext-profiler/inspector/exporter/example/requirements.txt create mode 100644 ext-profiler/inspector/inspector.cc create mode 100644 ext-profiler/inspector/inspector.h create mode 100644 ext-profiler/inspector/inspector_plugin.cc create mode 100644 ext-profiler/inspector/json.cc create mode 100644 ext-profiler/inspector/json.h create mode 100644 ext-profiler/inspector/nccl/common.h create mode 100644 ext-profiler/inspector/nccl/profiler.h create mode 100644 ext-profiler/inspector/nccl/profiler_net.h create mode 100644 ext-profiler/inspector/nccl/profiler_v1.h create mode 100644 ext-profiler/inspector/nccl/profiler_v2.h create mode 100644 ext-profiler/inspector/nccl/profiler_v3.h create mode 100644 ext-profiler/inspector/nccl/profiler_v4.h create mode 100644 ext-profiler/inspector/nccl/profiler_v5.h create mode 100644 ext-profiler/inspector/nccl/types.h create mode 100644 ext-profiler/inspector/version.h create mode 100644 ext-tuner/example/.gitignore create mode 100644 ext-tuner/example/CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/ce_coll.cc create mode 100644 src/dev_runtime.cc create mode 100644 src/device/CMakeLists.txt create mode 100644 src/graph/CMakeLists.txt create mode 100644 src/include/ce_coll.h create mode 100644 src/include/dev_runtime.h create mode 100644 src/include/nccl_device.h create mode 100644 src/include/nccl_device/README.md create mode 100644 src/include/nccl_device/comm.h create mode 100644 src/include/nccl_device/coop.h create mode 100644 src/include/nccl_device/core.h create mode 100644 src/include/nccl_device/impl/comm__funcs.h create mode 100644 src/include/nccl_device/impl/comm__types.h create mode 100644 src/include/nccl_device/impl/core__funcs.h create mode 100644 
src/include/nccl_device/impl/core__types.h create mode 100644 src/include/nccl_device/impl/ll_a2a__funcs.h create mode 100644 src/include/nccl_device/impl/ll_a2a__types.h create mode 100644 src/include/nccl_device/impl/mem_barrier__funcs.h create mode 100644 src/include/nccl_device/impl/mem_barrier__types.h create mode 100644 src/include/nccl_device/impl/ptr__funcs.h create mode 100644 src/include/nccl_device/impl/ptr__types.h create mode 100644 src/include/nccl_device/ll_a2a.h create mode 100644 src/include/nccl_device/mem_barrier.h create mode 100644 src/include/nccl_device/ptr.h create mode 100644 src/include/nccl_device/utility.h create mode 100644 src/include/plugin/net/net_v11.h create mode 100644 src/include/plugin/profiler/profiler_v5.h create mode 100644 src/include/plugin/tuner/tuner_v5.h create mode 100644 src/include/scheduler.h create mode 100644 src/include/sym_kernels.h delete mode 100644 src/include/symmetric.h create mode 100644 src/misc/CMakeLists.txt create mode 100644 src/nccl_device/CMakeLists.txt create mode 100644 src/nccl_device/core.cc create mode 100644 src/nccl_device/ll_a2a.cc create mode 100644 src/nccl_device/mem_barrier.cc create mode 100644 src/plugin/CMakeLists.txt create mode 100644 src/plugin/net/CMakeLists.txt create mode 100644 src/plugin/net/net_v11.cc create mode 100644 src/plugin/profiler/CMakeLists.txt create mode 100644 src/plugin/profiler/profiler_v5.cc create mode 100644 src/plugin/tuner/CMakeLists.txt create mode 100644 src/plugin/tuner/tuner_v5.cc create mode 100644 src/ras/CMakeLists.txt create mode 100644 src/register/CMakeLists.txt create mode 100644 src/scheduler/CMakeLists.txt create mode 100644 src/scheduler/symmetric_sched.cc rename src/{symmetric.cc => sym_kernels.cc} (52%) create mode 100644 src/transport/CMakeLists.txt diff --git a/ext-net/README.md b/ext-net/README.md index 90fe89bf5..8bcaf3096 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,36 +60,36 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v10) +# API (v11) -Below is the main `ncclNet_v10` struct. Each function is explained in later sections. +Below is the main `ncclNet_v11` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. 
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on internal ones. +Every call to `init` returns an opaque context that the plugin uses internally to allocate resources +and manage state. Such context is passed to other net plugin calls that create further resources, +such as `listen` and `connect`. Every context is uniquely associated to a communicator +using the commId. The network can also be initialized with a per communicator configuration using +the `config` argument. + To allow the plugin logs to integrate into the NCCL logs seemlessly, NCCL provides a logging function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within the plugin code adding the following definitions: @@ -282,7 +288,7 @@ side. `listen` To create a connection, NCCL will start by calling `listen` on the receiver side. This function -takes a device number as input argument, and should return a local `listenComm` object, and a +takes the opaque plugin context returned by `init` and a device number as input argument, and should return a local `listenComm` object, and a `handle` to pass to the other side, so that the sender side can connect to the receiver. The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL. @@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. -The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field. +The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference +the `ncclNetCommConfig_t` passed to the `init` function and containing a trafficClass field. This field can be used by the network plugin to specify the QoS level of the connection. By default, `trafficClass` is set to -1 but can be configured by the application during communicator initialization to select a plugin-supported QoS level. 
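To make the context flow concrete, here is a rough sketch (not part of the example plugin) of a v11 `init` that stashes the per-communicator config and a `connect` that consults it; the `pluginContext` layout and function names are hypothetical:

```
#include <stdint.h>
#include <stdlib.h>
#include "net.h"   /* brings in ncclNet_v11_t, ncclNetCommConfig_v11_t, error codes */

/* Hypothetical per-communicator context allocated by init. */
struct pluginContext {
  uint64_t commId;        /* communicator this context belongs to */
  int trafficClass;       /* copied from the per-communicator config */
};

static ncclResult_t pluginInit_v11(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config,
                                   ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  struct pluginContext* c = calloc(1, sizeof(*c));
  if (c == NULL) return ncclInternalError;
  c->commId = commId;
  /* NCCL_NET_TRAFFIC_CLASS_UNDEF (-1) means the application did not request a QoS level. */
  c->trafficClass = config ? config->trafficClass : NCCL_NET_TRAFFIC_CLASS_UNDEF;
  *ctx = c;
  return ncclSuccess;
}

static ncclResult_t pluginConnect_v11(void* ctx, int dev, void* handle,
                                      void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm) {
  struct pluginContext* c = (struct pluginContext*)ctx;
  /* A real plugin would map c->trafficClass to a transport-specific QoS setting here. */
  (void)c; (void)dev; (void)handle; (void)sendComm; (void)sendDevComm;
  return ncclInternalError; /* placeholder, like the example plugin stubs */
}
```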
diff --git a/ext-net/example/CMakeLists.txt b/ext-net/example/CMakeLists.txt new file mode 100644 index 000000000..d8af7fe36 --- /dev/null +++ b/ext-net/example/CMakeLists.txt @@ -0,0 +1,19 @@ +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c +) + +# Create shared library +add_library(nccl-net-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-net-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl +) + +# Set output name to match Makefile +set_target_properties(nccl-net-example PROPERTIES + OUTPUT_NAME "nccl-net-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins +) diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 4cc66915b..9b3e6e03c 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -22,7 +22,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +#define NCCL_NET_MAX_DEVS_PER_NIC 4 +#include "net_v11.h" #include "net_v10.h" #include "net_v9.h" #include "net_v8.h" @@ -33,9 +35,9 @@ #include "net_v3.h" #include "net_v2.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; +typedef ncclNet_v11_t ncclNet_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index d693101a3..56bcea83f 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -12,7 +12,7 @@ // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. -#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 +#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h index 809e7c001..bb0c661bb 100644 --- a/ext-net/example/nccl/net_v10.h +++ b/ext-net/example/nccl/net_v10.h @@ -5,10 +5,9 @@ #ifndef NET_V10_H_ #define NET_V10_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v10_t; diff --git a/ext-net/example/nccl/net_v11.h b/ext-net/example/nccl/net_v11.h new file mode 100644 index 000000000..1c8adc6c5 --- /dev/null +++ b/ext-net/example/nccl/net_v11.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request. +} ncclNetProperties_v11_t; + +typedef struct { + int32_t maxConcurrentPeers; + int32_t minConcurrentPeers; + int32_t maxFlowsPerPeer; + int32_t minFlowsPerPeer; +} ncclNetCommAttr_v11_t; + +typedef struct { + ncclNetCommAttr_v11_t sendCommAttr; + ncclNetCommAttr_v11_t recvCommAttr; + uint32_t op; + uint32_t algo; + uint32_t proto; +} ncclNetAttr_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the network. 
+ ncclResult_t (*finalize)(void* ctx); + + ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr); +} ncclNet_v11_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index ca60ad651..9dea09cbd 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -5,10 +5,9 @@ #ifndef NET_V9_H_ #define NET_V9_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v9_t; typedef struct { diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 97a29875d..b0a9a4c59 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { return ncclSuccess; } -__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } @@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } +__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; } #define PLUGIN_NAME "Plugin" -const ncclNet_v10_t ncclNetPlugin_v10 = { +const ncclNet_v11_t ncclNetPlugin_v11 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = { .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, .makeVDevice = pluginMakeVDevice, + .finalize = pluginFinalize, +}; + +__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } +__hidden ncclResult_t 
pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) { + // Below are default values, if unsure don't change. + + props->name = "Example"; + // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0 + props->pciPath = NULL; + // Only used to detect NICs with multiple PCI attachments. + props->guid = 0; + // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers. + props->ptrSupport = NCCL_PTR_HOST; + // If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled. + props->regIsGlobal = 0; + // Force flush after receive. Needed if the control path and data path use a different path to the GPU + props->forceFlush = 0; + // Speed in *Mbps*. 100000 means 100G + props->speed = 100000; + // Port number, used in conjunction with guid + props->port = 0; + // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values. + props->latency = 0; + // Maximum number of comm objects we can create. + props->maxComms = 1024*1024; + // Maximum number of receive operations taken by irecv(). + props->maxRecvs = NCCL_PLUGIN_MAX_RECVS; + // Coupling with NCCL network device-side code. + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices. + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + // Maximum transfer sizes the plugin can handle + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES; + return ncclSuccess; +} + +__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; } + +const ncclNet_v10_t ncclNetPlugin_v10 = { + .name = PLUGIN_NAME, + .init = pluginInit_v10, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v10, + .listen = pluginListen_v10, + .connect = pluginConnect_v10, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v10, }; + + __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { - return pluginInit(logFunction, NULL); + return pluginInit_v10(logFunction, NULL); } __hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { - return pluginGetProperties(dev, (ncclNetProperties_t*)props); + return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props); } __hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ - return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); + return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm); } __hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { @@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = 
{ .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v9, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr, @@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v8, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr, @@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v7, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr_v7, @@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, @@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, @@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { ncclResult_t ret; do { ncclNetDeviceHandle_v7_t* handle = NULL; - ret = pluginConnect(dev, NULL, handle, sendComm, &handle); + ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle); } while (ret == ncclSuccess && *sendComm == NULL); return ret; } @@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, @@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan } static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { max_requests = NCCL_NET_MAX_REQUESTS_V3; - return pluginInit(logFunction, NULL); + return pluginInit_v10(logFunction, NULL); } #include static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; - ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm); + ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm); memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4); return ret; } @@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = { .devices = pluginDevices, .pciPath = pluginPciPath, .ptrSupport = pluginPtrSupport, - .listen = pluginListen, + .listen = pluginListen_v3, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 27bd4e25c..1d85213a6 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v4) +# API (v5) -Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections. 
``` typedef struct { @@ -60,15 +60,15 @@ typedef struct { // init - initialize the profiler plugin // Input // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id // - commName : user assigned communicator name - // - commHash : communicator id // - nNodes : number of nodes in communicator // - nranks : number of ranks in communicator // - rank : rank identifier in communicator // - logfn : logger function // Output // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset // Input @@ -76,7 +76,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -88,13 +88,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v4_t; +} ncclProfiler_v5_t; ``` ## Error codes @@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct. ``` typedef struct { - uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...) - void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler - int rank; // rank that generated the event + uint64_t type; // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ... + void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler + int rank; // rank that generated the event union { + struct { // GroupAPI event metadata + bool graphCaptured; // Set to true if the Group API event is emitted inside a CUDA graph capture + int groupDepth; // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL) + // and not called by the user. Any depth greater than 1 means that the user made the Group API call. 
+ } groupApi; + + struct { // Collective API call metadata + const char* func; // string containing name of the collective operation + size_t count; // data count + const char* datatype; // string containing the name of the datatype + int root; // root rank + void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in + bool graphCaptured; // Set to true if the Collective API event is emitted inside a CUDA graph capture + } collApi; + + struct { // Point-to-point API call metadata + const char* func; // string containing name of the p2p operation + size_t count; // data count + const char* datatype; // string containing the name of the datatype + void* stream; // Opaque handle that points to a CUDA stream object + bool graphCaptured; // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture + } p2pApi; + + struct { // Kernel Launch event metadata + void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in + } kernelLaunch; + struct { // collective events metadata uint64_t seqNumber; // sequence number of this collective operation in the communicator const char* func; // string containing name of the collective @@ -164,6 +191,7 @@ typedef struct { uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective const char* proto; // string containing name of the protocol for this collective + void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent } coll; struct { // point-to-point events metadata @@ -173,6 +201,7 @@ typedef struct { size_t count; int peer; // peer rank for this point-to-point uint8_t nChannels; // number of channels for this p2p + void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent } p2p; struct { // proxyOp events metadata @@ -198,12 +227,12 @@ typedef struct { void* data; // pointer to network plugin defined event } netPlugin; }; -} ncclProfilerEventDescr_v4_t; +} ncclProfilerEventDescr_v5_t; ``` -NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, -`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and -`ncclProfileNetPlugin`. +NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`, +`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, +`ncclProfileKernelCh` and `ncclProfileNetPlugin`. #### stopEvent @@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior. #### recordEventState -Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`, -`ncclProfileP2p`, cannot be updated through calls to `recordEventState`. +Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`, +`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`. -`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and +`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and `ncclProfileProxyCtrl` can be updated through calls to `recordEventState`. The state of these events can be updated, along with event attributes, using `recordEventState`.
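To make the contract above concrete, below is a minimal, hypothetical sketch of how a plugin could implement `recordEventState`; it is not the example plugin shipped in `ext-profiler/example`. Only the v5 types (`ncclProfilerEventState_v5_t`, `ncclProfilerEventStateArgs_v5_t`, the `proxyStep.transSize` attribute) and the `ncclProfileProxyStep` event bit are taken from this document; the `struct myEvent` layout and the `nowUs` helper are assumptions made purely for illustration.

```
// Hypothetical sketch (not the shipped example plugin).
// Assumes startEvent returned a pointer to the plugin's own struct myEvent as the event handle.
#include <stddef.h>
#include <stdint.h>
#include <sys/time.h>
#include "profiler.h"   // ncclProfiler v5 types, as laid out in ext-profiler/example/nccl

struct myEvent {
  uint64_t type;         // one of the ncclProfile* event type bits
  int      lastState;    // last ncclProfilerEventState_v5_t value recorded
  double   lastStateTs;  // timestamp (us) of that state transition
  size_t   transSize;    // bytes reported so far, for proxy step events
};

static double nowUs(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec*1e6 + tv.tv_usec;
}

static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v5_t eState,
                                       ncclProfilerEventStateArgs_v5_t* eStateArgs) {
  struct myEvent* ev = (struct myEvent*)eHandle;
  // The handle may be NULL if the event was dropped at startEvent time; ignore the update.
  if (ev == NULL) return ncclSuccess;
  ev->lastState = (int)eState;
  ev->lastStateTs = nowUs();
  // For proxy step events the optional state args carry the transferred size.
  if (eStateArgs && ev->type == ncclProfileProxyStep) ev->transSize = eStateArgs->proxyStep.transSize;
  return ncclSuccess;
}
```

A real plugin would typically keep one timestamp per state (for example for each of the proxy step and proxy control transitions listed below) rather than only the last one, so that per-state durations can be reconstructed when the event is stopped.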
@@ -258,9 +287,21 @@ typedef enum { // ncclProfileKernelCh event states ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update -} ncclProfilerEventState_v4_t; + + // Group API States + ncclProfilerGroupStartApiStop = 23,// state marks the end of a ncclGroupStart() API call + ncclProfilerEndGroupApiStart = 24 // state marks the start of a ncclGroupEnd() API call +} ncclProfilerEventState_v5_t; ``` +NCCL profile API events are generated when the API calls are made, right after NCCL checks +for graph capture information. They parent collective, point-to-point and kernel launch events +and persist across multiple operations in a group. + +`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the +case of graph capture, the event start indicates that the kernel launch operation has been recorded, +not launched. + `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyOp events are generated for every active channel and provide a summary of the activity of the proxy progress thread for that channel. Most of the @@ -379,7 +420,7 @@ typedef union { struct { // attribute to update for ncclProfileKernelCh events uint64_t pTimer; // timestamp provided by the NCCL kernel } kernelCh; -} ncclProfilerEventStateArgs_v4_t; +} ncclProfilerEventStateArgs_v5_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. @@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur NCCL core events (reported above) are organized into a hierarchy as reported below: ``` -Group event +Group API event | - +- Collective event + +- Collective API event | | - | +- ProxyOp event - | | | - | | +- ProxyStep event - | | | - | | +- NetPlugin event + | +- Collective event + | | + | +- ProxyOp event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event + | + +- Point-to-point API event | | - | +- KernelCh event + | +- Point-to-point event + | | + | +- ProxyOp event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event | - +- Point-to-point event - | - +- ProxyOp event - | | - | +- ProxyStep event - | | - | +- NetPlugin event - | - +- KernelCh event + +- Kernel Launch event ProxyCtrl event ``` diff --git a/ext-profiler/example/CMakeLists.txt b/ext-profiler/example/CMakeLists.txt new file mode 100644 index 000000000..fd2f04df6 --- /dev/null +++ b/ext-profiler/example/CMakeLists.txt @@ -0,0 +1,34 @@ +# Find all C source files in current directory +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc + ${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc +) + +# Create shared library +add_library(nccl-profiler-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-profiler-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl + ${CUDAToolkit_INCLUDE_DIRS} +) + +# Set output name to match Makefile +set_target_properties(nccl-profiler-example PROPERTIES + OUTPUT_NAME "nccl-profiler-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib +) + +add_custom_command(TARGET nccl-profiler-example POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so ${CMAKE_BINARY_DIR}/test/unit/plugins +) + +# Add custom target 
for clean (equivalent to Makefile clean target) +add_custom_target(clean-profiler-lib + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so + COMMENT "Cleaning libnccl-profiler-example.so" +) diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index 777ff5bad..f6383e1b6 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -5,18 +5,20 @@ # .DEFAULT_GOAL: build include ../../makefiles/common.mk -SRCDIR ?= $(abspath ../..) BUILDDIR ?= . NCCLDIR := $(BUILDDIR) -SRC_FILES := $(wildcard *.c) +SRC_FILES := $(wildcard *.cc) +DST_DIR := $(BUILDDIR) +OBJ_FILES := $(SRC_FILES:%.cc=${DST_DIR}/%.o) +DEP_FILES := $(OBJ_FILES:%.o=%.dep) -build: ${BUILDDIR}/libnccl-profiler-example.so +build: ${DST_DIR}/libnccl-profiler-example.so -${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} +${DST_DIR}/libnccl-profiler-example.so: ${SRC_FILES} @printf "Compiling %-35s > %s\n" $< $@ - @mkdir -p ${BUILDDIR} - $(CC) -Inccl -fPIC -shared -o $@ $^ + @mkdir -p ${DST_DIR} + $(CXX) -Inccl -I${CUDA_INC} -fPIC -shared -o $@ $^ clean: - rm -f ${BUILDDIR}/libnccl-profiler-example.so + rm -f ${DST_DIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/README.md b/ext-profiler/example/README.md index d98e58f15..abc11a57e 100644 --- a/ext-profiler/example/README.md +++ b/ext-profiler/example/README.md @@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of. ## Building the profiler plugin -To use the example plugin, just type `make`. You will need a NCCL build's include directory present. -You can override `NCCL_HOME` to where the NCCL installation is on your system. +To build the example plugin shipped as part of NCCL, just type `make`. ## Using the profiler plugin @@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system. As an example, setting: - `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`) + `NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`) - enables the profiling of the group, the collective and the proxy op events. The same events can be + enables the profiling of the group API, the collective and the proxy op events. The same events can be expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed, in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage is that the profiler can easily correlate events that belong to the same NCCL operation and present - them accordingly. + them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler. 3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome @@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events generated by remote proxies. 
A list of pools and their size is reported below: -- `NCCL_PROFILE_GROUP_POOL_SIZE` (16) -- `NCCL_PROFILE_COLL_POOL_SIZE` (16) -- `NCCL_PROFILE_P2P_POOL_SIZE` (1024) +- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256) +- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256) +- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256) +- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256) +- `NCCL_PROFILE_COLL_POOL_SIZE` (256) +- `NCCL_PROFILE_P2P_POOL_SIZE` (256) - `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16) -- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128) +- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256) Remote proxy operations are generated when PXN is in use. Refer to this article for more information about PXN and how it works: @@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace ``` [ -{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}}, -{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}}, -{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 
768506.941406, "args": {"Step": 2}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000}, -{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, 
"pid": 4157654, "tid": 1, "ts": 769622.517578}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875}, +{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}}, +{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}}, +{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990}, +{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}}, +{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}}, +{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}}, +{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995}, +{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{“name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 
119867.338989}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997}, +{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995}, +{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979}, +{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993}, +{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}}, +{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992}, ... 
[ trace truncated for brevity ] -{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383}, -{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945}, +{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980}, +{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981}, +{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001}, {}] ``` Details about the fields used in the trace can be found at this link: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw -The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through +The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call. (Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only one collective and this is what is presented in the traces above). @@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation. - datatype : NCCL datatype - algorithm : algorithm used to process the ncclAllReduce - protocol : protocol used to process the ncclAllReduce -- nMaxChannels: max number of channels used to process the ncclAllReduce +- nChannels : Number of channels used to process the ncclAllReduce If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling of collective and p2p operations`. -### Proxy Send -The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following -info in the args field: - -- Channel : id of the channel used by this proxy operation to send data to the peer -- Peer : peer rank -- Steps : number of network steps required to transfer transSize bytes to the peer -- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread -- transSize : bytes transferred across the channel by this proxy operation -- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post -- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait -- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send -- DONE : struct containing the number of network sends completed and the time stamp of the last send completed - -In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps, -which could help identify at which point the network problem occurred. - The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace entries below are also reported by the profiler. -#### Proxy SendBufferWait - -Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available. 
- -#### Proxy SendGPUWait +#### Proxy SendGpuWait Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging buffer. @@ -201,31 +164,6 @@ buffer. Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete -### Proxy Recv - -The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following -info in the args field: - -- Channel : id of the channel used by this proxy operation to recv data from the peer -- Peer : peer rank -- Steps : number of network steps required to transfer transSize bytes from the peer -- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread -- transSize : bytes transferred across the channel by this proxy operation -- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted -- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed -- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed -- DONE : struct containing the number of flush completed and the time stamp for the last flush completed - -The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are -needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace -entries below are also reported by the profiler. - - -#### Proxy RecvBufferWait - -Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to -become available. - #### Proxy RecvWait Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete @@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU -#### Proxy RecvGPUWait +#### Proxy RecvGpuWait Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data diff --git a/ext-profiler/example/event.c b/ext-profiler/example/event.c deleted file mode 100644 index 717fe8688..000000000 --- a/ext-profiler/example/event.c +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include "event.h" - -int taskEventQueueEmpty(struct group* g) { - return g->eventHead == NULL; -} - -void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { - event->next = NULL; - if (g->eventHead) g->eventTail->next = event; - else g->eventHead = event; - g->eventTail = event; -} - -struct taskEventBase* taskEventQueueHead(struct group* g) { - return g->eventHead; -} - -struct taskEventBase* taskEventQueueDequeue(struct group* g) { - struct taskEventBase* tmp = g->eventHead; - g->eventHead = g->eventHead->next; - if (g->eventHead == NULL) g->eventTail = NULL; - return tmp; -} diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 4c1b8f53a..ae830cd25 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -10,10 +10,14 @@ #include #include #include +#include +#include "err.h" #include "profiler.h" +#include "queue.h" +#include #define MAX_CHANNELS 32 -#define MAX_STEPS 16 +#define MAX_STEPS 1024 #define MAX_OPS 16 // Up to 64K ranks for PAT #define MAX_EVENTS_PER_REQ (8) @@ -21,7 +25,7 @@ struct proxyOp; struct proxyStep; struct netPlugin { - uint8_t type; + uint64_t type; int pluginType; int pluginVer; uint8_t pluginEvent; @@ -63,7 +67,7 @@ struct kernelCh { #define PROXY_STEP_MAX_STATES 3 struct proxyStep { - uint8_t type; // type of event: network transfer + uint64_t type; // type of event: network transfer int state; int step; // network transfer id in given channel int isSend; // send/recv channel operation @@ -76,7 +80,7 @@ struct proxyStep { }; struct proxyOp { - uint8_t type; // type of event: proxy operation + uint64_t type; // type of event: proxy operation uint8_t channelId; // channel id for this proxy operation pid_t pid; int rank; @@ -97,7 +101,7 @@ struct group; struct context; struct proxyCtrl { - uint8_t type; + uint64_t type; struct context* ctx; // profiler context double startTs; double stopTs; @@ -107,12 +111,12 @@ struct proxyCtrl { // task level event base structure struct taskEventBase { - uint8_t type; // event type: collective/p2p + uint64_t type; // event type: collective/p2p int rank; // rank of the operation in NCCL communicator const char* func; // ncclFunc* int refCount; // number of references for this operation - struct group* parent; // parent event group - struct taskEventBase* next; // next top level event in group + void* parent; // parent API event + struct taskEventBase* next; // next top level event double startTs; double stopTs; }; @@ -147,7 +151,7 @@ struct p2p { }; struct group { - uint8_t type; + uint64_t type; struct context* ctx; // profiler context int groupId; int refCount; @@ -158,6 +162,70 @@ struct group { struct group* next; // next group event in queue }; +struct collApi { + uint64_t type; + struct groupApi* parent; + struct context* ctx; // profiler context + int collApiId; + int refCount; + cudaStream_t stream; + const char* func; + size_t count; + const char* datatype; + int root; + bool graphCaptured; + struct taskEventBase* eventHead; // queue head for task events + struct taskEventBase* eventTail; // queue tail for task events + double startTs; + double stopTs; + struct collApi* next; +}; + +struct p2pApi { + uint64_t type; + struct groupApi* parent; + struct context* ctx; // profiler context + int p2pApiId; + int refCount; + const char* func; + cudaStream_t stream; + size_t count; + const char* datatype; + bool graphCaptured; + 
struct taskEventBase* eventHead; // queue head for task events + struct taskEventBase* eventTail; // queue tail for task events + double startTs; + double stopTs; + struct p2pApi* next; +}; + +struct kernelLaunch { + uint64_t type; + struct groupApi* parent; + cudaStream_t stream; + int kernelLaunchId; + double startTs; + double stopTs; + struct kernelLaunch* next; +}; + +struct groupApi { + uint64_t type; + struct context* ctx; + int groupApiId; + int refCount; + bool graphCaptured; + int groupDepth; + struct profilerQueue p2pApiEvents; + struct profilerQueue collApiEvents; + struct profilerQueue kernelLaunchEvents; + double endOfncclGroupStartTs; + double startOfncclGroupEndTs; + double startTs; + double stopTs; + struct groupApi* next; +}; + // arrays for different event objects struct context { const char* commName; @@ -165,6 +233,26 @@ struct context { int nranks; int rank; + int groupApiPoolSize; + int groupApiPoolBase; + int groupApiPoolIndex; + struct groupApi* groupApiPool; + + int collApiPoolSize; + int collApiPoolBase; + int collApiPoolIndex; + struct collApi* collApiPool; + + int p2pApiPoolSize; + int p2pApiPoolBase; + int p2pApiPoolIndex; + struct p2pApi* p2pApiPool; + + int kernelLaunchPoolSize; + int kernelLaunchPoolBase; + int kernelLaunchPoolIndex; + struct kernelLaunch* kernelLaunchPool; + int groupPoolSize; int groupPoolBase; int groupPoolIndex; @@ -186,9 +274,50 @@ struct context { struct proxyCtrl* proxyCtrlPool; }; -int taskEventQueueEmpty(struct group* g); -void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event); -struct taskEventBase* taskEventQueueHead(struct group* g); -struct taskEventBase* taskEventQueueDequeue(struct group* g); +template +inline int taskEventQueueEmpty(T *obj) { + return obj->eventHead == NULL; +} + +template +inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) { + event->next = NULL; + if (obj->eventHead) obj->eventTail->next = event; + else obj->eventHead = event; + obj->eventTail = event; +} + +template +inline struct taskEventBase* taskEventQueueHead(T *obj) { + return obj->eventHead; +} + +template +inline struct taskEventBase* taskEventQueueDequeue(T* obj) { + struct taskEventBase* tmp = obj->eventHead; + obj->eventHead = obj->eventHead->next; + if (obj->eventHead == NULL) obj->eventTail = NULL; + return tmp; +} + +template +inline void resetTaskEvents(T *obj, struct context* ctx) { + while (!taskEventQueueEmpty(obj)) { + struct taskEventBase* base = taskEventQueueDequeue(obj); + if (base->type == ncclProfileColl) { + struct collective* c = (struct collective *)base; + // reset event proxyOps & proxySteps + memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); + // release collective events in the group and return them to the collective pool + __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); + } else if (base->type == ncclProfileP2p) { + struct p2p* p = (struct p2p *)base; + // reset event proxyOp and proxySteps + memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + // release p2p events in the group and return them to the p2p pool + __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); + } + } +} #endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index c911426d9..715885f72 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -11,17 +11,20 @@ #include #include "common.h" -#include "err.h" enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective 
call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileKernelCh = (1 << 6), // kernel channel event type - ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events }; typedef enum { @@ -56,21 +59,27 @@ typedef enum { /* Kernel event states */ ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerEndGroupApiStart = 23, + ncclProfilerBeginGroupApiEnd = 24 } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; +#include "profiler_v5.h" #include "profiler_v4.h" #include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" #include "profiler_net.h" -typedef ncclProfiler_v4_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v5.h b/ext-profiler/example/nccl/profiler_v5.h new file mode 100644 index 000000000..8bbc85eeb --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v5.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ +#include + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileGroupApi, ... 
+ void* parentObj; // pointer to the profiler parent object + int rank; // originating rank + union { + struct { + int graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context 
object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.cc similarity index 68% rename from ext-profiler/example/plugin.c rename to ext-profiler/example/plugin.cc index b89cd4627..f6d4956b3 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.cc @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -22,12 +22,20 @@ static int initialized; // initialization counter for profiler static double startTime; // profiler start time static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p; -static const int defaultGroupPoolSize = 16; -static const int defaultCollPoolSize = 16; -static const int defaultP2pPoolSize = 1024; +static const int defaultGroupApiPoolSize = 256; +static const int defaultCollApiPoolSize = 256; +static const int defaultP2pApiPoolSize = 256; +static const int defaultKernelLaunchPoolSize = 256; +static const int defaultGroupPoolSize = 256; +static const int defaultCollPoolSize = 256; +static const int defaultP2pPoolSize = 256; static const int defaultProxyCtrlPoolSize = 16; -static const int defaultDetachPoolSize = 128; +static const int defaultDetachPoolSize = 256; +static int groupApiPoolSize; +static int collApiPoolSize; +static int p2pApiPoolSize; +static int kernelLaunchPoolSize; static int groupPoolSize; static int collPoolSize; static int p2pPoolSize; @@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; static int* eActivationMaskPtr; -__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool @@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, str = getenv("NCCL_PROFILE_EVENT_MASK"); __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED); + str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE"); + groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize; + + str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE"); + collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize; + + str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE"); + p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize; + + str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE"); + kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize; + str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; @@ -96,11 +116,23 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); ctx->commName = commName; - ctx->commHash = commHash; + ctx->commHash = commId; ctx->nranks = nranks; ctx->rank = rank; logFn = logfn; - INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank); + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? 
commName : "", commId, nranks, rank); + + ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool)); + if (ctx->groupApiPool == NULL) goto fail; + + ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool)); + if (ctx->collApiPool == NULL) goto fail; + + ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool)); + if (ctx->p2pApiPool == NULL) goto fail; + + ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool)); + if (ctx->kernelLaunchPool == NULL) goto fail; ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); if (ctx->groupPool == NULL) goto fail; @@ -130,16 +162,22 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, if (ctx->p2pPool) free(ctx->p2pPool); if (ctx->collPool) free(ctx->collPool); if (ctx->groupPool) free(ctx->groupPool); + if (ctx->collApiPool) free(ctx->collApiPool); + if (ctx->p2pApiPool) free(ctx->p2pApiPool); + if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool); + if (ctx->groupApiPool) free(ctx->groupApiPool); free(ctx); if (detachPool) free(detachPool); return ncclSystemError; } +static const char* profilerDumpFile; + __hidden ncclResult_t exampleProfilerFinalize(void* context) { FILE* fh = NULL; char filename[PATH_MAX] = { 0 }; struct context* ctx = (struct context *)context; - const char* dump = getenv("NCCL_PROFILE_DUMP_FILE"); + const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE"); if (dump) { sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank); fh = fopen(filename, "w"); @@ -148,10 +186,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank); // print last N groups/collectives/p2ps - int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0; - int end = ctx->groupPoolIndex; + // Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy. + // Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example. + int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0; + int end = ctx->groupApiPoolIndex; for (int i = start; i < end; i++) { - printEvent(fh, &ctx->groupPool[i%groupPoolSize]); + printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]); } start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? 
ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0; @@ -161,6 +201,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { } free(ctx->groupPool); + free(ctx->collApiPool); + free(ctx->p2pApiPool); + free(ctx->kernelLaunchPool); + free(ctx->groupApiPool); free(ctx->collPool); free(ctx->p2pPool); free(ctx->proxyCtrlPool); @@ -187,7 +231,113 @@ __hidden void updateEvent(void* handle); __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { *eHandle = NULL; struct context* ctx = (struct context *)context; - if (eDescr->type == ncclProfileGroup) { + if (eDescr->type == ncclProfileGroupApi) { + struct groupApi* event; + int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) { + // if there are available group API events grab one + event = &ctx->groupApiPool[groupApiId%groupApiPoolSize]; + // Make sure all child events of the picked group API event are cleared + while (!profilerQueueEmpty(&event->collApiEvents)) { + struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents); + resetTaskEvents(collApiEvent, ctx); + __atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED); + } + while (!profilerQueueEmpty(&event->p2pApiEvents)) { + struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents); + resetTaskEvents(p2pApiEvent, ctx); + __atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED); + } + while (!profilerQueueEmpty(&event->kernelLaunchEvents)) { + profilerQueueDequeue(&event->kernelLaunchEvents); + __atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED); + } + } else { + // else drop this event + __atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileGroupApi; + event->ctx = ctx; + event->groupApiId = groupApiId; + event->graphCaptured = eDescr->groupApi.graphCaptured; + event->groupDepth = eDescr->groupApi.groupDepth; + event->startTs = gettime() - startTime; + *eHandle = event; + } else if (eDescr->type == ncclProfileCollApi) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct collApi* event; + int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) { + // if there are available Coll API events grab one + event = &ctx->collApiPool[collApiId%collApiPoolSize]; + resetTaskEvents(event, ctx); + } else { + // else drop this event + __atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileCollApi; + event->collApiId = collApiId; + event->ctx = ctx; + event->func = eDescr->collApi.func; + event->stream = (cudaStream_t) eDescr->collApi.stream; + event->count = eDescr->collApi.count; + event->datatype = eDescr->collApi.datatype; + event->root = eDescr->collApi.root; + event->graphCaptured = eDescr->collApi.graphCaptured; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->collApiEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileP2pApi) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct p2pApi* event; + int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, 
__ATOMIC_RELAXED)) < p2pApiPoolSize) { + // if there are available p2p API events grab one + event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize]; + resetTaskEvents(event, ctx); + } else { + // else drop this event + __atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileP2pApi; + event->p2pApiId = p2pApiId; + event->ctx = ctx; + event->func = eDescr->p2pApi.func; + event->stream = (cudaStream_t) eDescr->p2pApi.stream; + event->count = eDescr->p2pApi.count; + event->datatype = eDescr->p2pApi.datatype; + event->graphCaptured = eDescr->p2pApi.graphCaptured; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->p2pApiEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileKernelLaunch) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct kernelLaunch* event; + int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED); + if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) { + // if there are available kernel API events grab one + event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize]; + } else { + // else drop this event + __atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileKernelLaunch; + event->stream = (cudaStream_t) eDescr->kernelLaunch.stream; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->kernelLaunchEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileGroup) { + if (eDescr->parentObj == NULL) return ncclSuccess; struct group* event; int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED); if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) { @@ -222,7 +372,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n debugEvent(event, "GroupStart"); } else if (eDescr->type == ncclProfileColl) { // the parent might be null if we run out of events - struct group* parent = (struct group *)eDescr->parentObj; + struct collApi* parent = (struct collApi *)eDescr->parentObj; if (parent == NULL) return ncclSuccess; struct collective* event; @@ -253,12 +403,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->proto = eDescr->coll.proto; *eHandle = event; taskEventQueueEnqueue(parent, (struct taskEventBase *)event); - // increment the group ref counter so the event will staty open + // increment the group ref counter so the event will stay open __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "CollStart"); } else if (eDescr->type == ncclProfileP2p) { // the parent might be null if we run out of events - struct group* parent = (struct group *)eDescr->parentObj; + struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj; if (parent == NULL) return ncclSuccess; struct p2p* event; @@ -458,8 +608,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n } void updateEvent(void* handle) { - uint8_t type = *(uint8_t *)handle; - if (type == ncclProfileGroup) { + uint64_t type = *(uint64_t *)handle; + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) 
handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED); + } + } else if (type == ncclProfileCollApi) { + struct collApi* event = (struct collApi*) handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED); + } + updateEvent(event->parent); + return; + } else if (type == ncclProfileP2pApi) { + struct p2pApi* event = (struct p2pApi*) handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED); + } + updateEvent(event->parent); + event->stopTs = gettime() - startTime; + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* event = (struct kernelLaunch*) handle; + event->stopTs = gettime() - startTime; + updateEvent(event->parent); + } else if (type == ncclProfileGroup) { struct group* event = (struct group *)handle; if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { event->stopTs = gettime() - startTime; @@ -527,25 +703,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) { // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - uint8_t type = *(uint8_t *)eHandle; - if (type == ncclProfileGroup) { - // stopping the group event in NCCL core does not - // mean the group has completed. It means the group - // was submitted/enqueued so we need to keep the event open + uint64_t type = *(uint64_t *)eHandle; + // Stopping API events, Kernel Launch events, collective/p2p task events + // in NCCL core do not mean that they are complete. It means that the + // operation was enqueued so we need to keep the events open + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileCollApi) { + struct collApi* event = (struct collApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileP2pApi) { + struct p2pApi* event = (struct p2pApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* event = (struct kernelLaunch*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileGroup) { struct group* event = (struct group *)eHandle; event->stopTs = gettime() - startTime; return ncclSuccess; } else if (type == ncclProfileColl) { - // stopping the collective event in NCCL core does not - // mean the collective has completed. It means the collective - // was submitted/enqueued so we need to keep the event open struct collective* event = (struct collective *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; } else if (type == ncclProfileP2p) { - // stopping the p2p event in NCCL core does not - // mean the p2p has completed. 
It means the p2p - // was submitted/enqueued so we need to keep the event open struct p2p* event = (struct p2p *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; @@ -559,8 +745,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - uint8_t type = *(uint8_t *)eHandle; - if (type == ncclProfileProxyOp) { + uint64_t type = *(uint64_t *)eHandle; + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) eHandle; + if (eState == ncclProfilerEndGroupApiStart) { + event->endOfncclGroupStartTs = gettime() - startTime; + } else if (eState == ncclProfilerBeginGroupApiEnd) { + event->startOfncclGroupEndTs = gettime() - startTime; + } + } else if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; if (eState == ncclProfilerProxyOpInProgress_v4) { event->progrTs = gettime() - startTime; @@ -592,6 +785,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile case ncclProfilerProxyStepRecvGPUWait: event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime; break; + default: + break; } } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* event = (struct proxyCtrl *)eHandle; @@ -609,7 +804,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_t ncclProfiler_v4 = { +ncclProfiler_t ncclProfiler_v5 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, @@ -618,14 +813,15 @@ ncclProfiler_t ncclProfiler_v4 = { exampleProfilerFinalize, }; -int exampleProfilerStart(int eActivationMask) { +__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) { + profilerDumpFile = name; if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED); } return ncclSuccess; } -int exampleProfilerStop(void) { +__attribute__((visibility("default"))) int exampleProfilerStop(void) { if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED); } diff --git a/ext-profiler/example/plugin.h b/ext-profiler/example/plugin.h index b4d07060a..9248ebf08 100644 --- a/ext-profiler/example/plugin.h +++ b/ext-profiler/example/plugin.h @@ -7,7 +7,8 @@ #ifndef PLUGIN_H_ #define PLUGIN_H_ -int exampleProfilerStart(int eActivationMask); -int exampleProfilerStop(void); +__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name); +__attribute__((visibility("default"))) int exampleProfilerStop(void); + #endif diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.cc similarity index 76% rename from ext-profiler/example/print_event.c rename to ext-profiler/example/print_event.cc index a56106e10..ca3c7cfae 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.cc @@ -5,15 +5,59 @@ ************************************************************************/ #include +#include "err.h" #include "profiler.h" #include "event.h" #include "print_event.h" +#include #define __hidden __attribute__ ((visibility("hidden"))) // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category // It appears that nesting more than three events causes issues. 
Therefore, every event is given an increasing id and a -// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET) +// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET) +static __thread int groupApiId; +__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n", + "Group API", groupApiId, getpid(), 1, event->startTs, event->groupApiId, event->groupDepth); +} + +__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Group API", groupApiId++, getpid(), 1, event->stopTs); +} + +static __thread int p2pApiId; +__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"GraphCaptured\":%d, \"Stream\": %p}},\n", + event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream); +} + +__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + event->func, p2pApiId++, getpid(), 1, event->stopTs); +} + +static __thread int collApiId; +__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"root\": %d, \"GraphCaptured\":%d, \"Stream\": %p}},\n", + event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream); +} + +__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + event->func, collApiId++, getpid(), 1, event->stopTs); +} + +static __thread int kernelLaunchId; +__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": %p}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream); +} + +__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "KernelLaunch", kernelLaunchId++, getpid(), 1, event->stopTs); +} + static __thread int groupId; __hidden void printGroupEventHeader(FILE* fh, struct group* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n", @@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 
%d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n", - event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { @@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n", - event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); + event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { @@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) { char filename[64] = { 0 }; sprintf(filename, "EventDebug-%d", getpid()); FILE* fh = fopen(filename, "a+"); - uint8_t type = *(uint8_t *)eHandle; + uint64_t type = *(uint64_t *)eHandle; if (type == ncclProfileGroup) { struct group* event = (struct group *)eHandle; fprintf(fh, "Group event %p tag = %s {\n", event, tag); @@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) { void printEvent(FILE* fh, void* handle) { if (handle == NULL || fh == NULL) return; - uint8_t type = *(uint8_t *)handle; - if (type == ncclProfileGroup) { + uint64_t type = *(uint64_t *)handle; + if (type == ncclProfileGroupApi) { + struct groupApi* g = (struct groupApi*) handle; + printGroupApiEventHeader(fh, g); + struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents); + while (kernelLaunchHead != NULL) { + printEvent(fh, kernelLaunchHead); + kernelLaunchHead = kernelLaunchHead->next; + } + struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents); + while (collApiHead != NULL) { + printEvent(fh, collApiHead); + collApiHead = collApiHead->next; + } + struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents); + while (p2pApiHead != NULL) { + printEvent(fh, p2pApiHead); + p2pApiHead = p2pApiHead->next; + } + printGroupApiEventTrailer(fh, g); + } else if (type == ncclProfileCollApi) { + struct collApi* collApiEvent = (struct collApi *) handle; + printCollApiEventHeader(fh, collApiEvent); + struct taskEventBase* base = taskEventQueueHead(collApiEvent); + while (base) { + struct taskEventBase* next = base->next; + printEvent(fh, base); + base = next; + } + printCollApiEventTrailer(fh, collApiEvent); + } else if (type == ncclProfileP2pApi) { + struct p2pApi* p2pApiEvent = (struct p2pApi *) handle; + printP2pApiEventHeader(fh, p2pApiEvent); + struct taskEventBase* base = taskEventQueueHead(p2pApiEvent); + while (base) { + struct taskEventBase* next = base->next; + 
printEvent(fh, base); + base = next; + } + printP2pApiEventTrailer(fh, p2pApiEvent); + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle; + printKernelLaunchEventHeader(fh, kernelLaunchEvent); + printKernelLaunchEventTrailer(fh, kernelLaunchEvent); + } else if (type == ncclProfileGroup) { struct group* g = (struct group *)handle; printGroupEventHeader(fh, g); struct taskEventBase* base = taskEventQueueHead(g); diff --git a/ext-profiler/example/queue.h b/ext-profiler/example/queue.h new file mode 100644 index 000000000..dfb14f575 --- /dev/null +++ b/ext-profiler/example/queue.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef QUEUE_H +#define QUEUE_H + +template +struct profilerQueue { + T *head, *tail; +}; + +template + inline void profilerQueueConstruct(profilerQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template + inline bool profilerQueueEmpty(profilerQueue *me) { + return me->head == nullptr; +} + +template +inline T* profilerQueueHead(profilerQueue *me) { + return me->head; +} + +template + inline T* profilerQueueTail(profilerQueue *me) { + return me->tail; +} + +template + inline void profilerQueueEnqueue(profilerQueue *me, T *x) { + x->*next = nullptr; + (me->head ? me->tail->*next : me->head) = x; + me->tail = x; +} + +template + inline T* profilerQueueDequeue(profilerQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +#endif diff --git a/ext-profiler/google-CoMMA/Makefile b/ext-profiler/google-CoMMA/Makefile new file mode 100644 index 000000000..2da516990 --- /dev/null +++ b/ext-profiler/google-CoMMA/Makefile @@ -0,0 +1,22 @@ +.PHONY: build-CoMMA + +all: build-CoMMA + +build-CoMMA: clone-CoMMA + cd CoMMA && cargo build + +clone-CoMMA: + @if [ ! -d CoMMA ] ; then \ + git clone https://github.com/google/CoMMA.git; \ + ln -s $(PWD)/.. CoMMA/third_party/nccl/ext-profiler; \ + fi + +clean: + @if [ -d CoMMA ] ; then \ + cd CoMMA && cargo clean; \ + fi + +delete: + @if [ -d CoMMA ] ; then \ + rm -rf CoMMA; \ + fi diff --git a/ext-profiler/inspector/Makefile b/ext-profiler/inspector/Makefile new file mode 100644 index 000000000..301c46b20 --- /dev/null +++ b/ext-profiler/inspector/Makefile @@ -0,0 +1,62 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Variables +NCCL_HOME := ../../build +INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl +PLUGIN_SO := libnccl-profiler-inspector.so +VERSION_FILE := version.cc + +# Compiler and flags +CXX := g++ +CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra + +ifeq ($(DEBUG), 1) +CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer +endif + +ifeq ($(ASAN), 1) +CXXFLAGS += -fsanitize=address +LDFLAGS += -fsanitize=address -static-libasan +NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan +endif + +ifeq ($(UBSAN), 1) +CXXFLAGS += -fsanitize=undefined +LDFLAGS += -fsanitize=undefined -static-libubsan +NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan +endif + +# Source files +SOURCES := inspector_plugin.cc inspector.cc json.cc + +# Default target +all: $(PLUGIN_SO) + +# Rule to build the plugin +$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES) + @echo "Compiling to create $@ from $^" + $(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Rule to generate version.cc +$(VERSION_FILE): + @GIT_INFO=$$(./utils/extract_git_version.sh); \ + echo '#include "version.h"' > $(VERSION_FILE).tmp; \ + echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \ + if ! cmp $(VERSION_FILE).tmp $(VERSION_FILE); then \ + echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \ + mv $(VERSION_FILE).tmp $(VERSION_FILE); \ + else \ + echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \ + rm $(VERSION_FILE).tmp; \ + fi + +# Clean target +clean: + rm -f $(VERSION_FILE) $(PLUGIN_SO) + +# Phony targets +.PHONY: all clean diff --git a/ext-profiler/inspector/README.md b/ext-profiler/inspector/README.md new file mode 100644 index 000000000..daf26f7dd --- /dev/null +++ b/ext-profiler/inspector/README.md @@ -0,0 +1,216 @@ +# NCCL Inspector Plugin + +The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation. + +## Related Documentation + +- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs + +## Folder Location + +The Inspector plugin source is located in: + +``` +ext-profiler/inspector/ +``` + +## Building the Inspector Plugin + +To build the Inspector plugin, run: + +```bash +make +``` + +The build system will automatically detect CUDA and NCCL installations from your environment. If you need to specify custom paths, you can set `CUDA_HOME` and `NCCL_HOME` environment variables or pass them as make arguments. 
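For example, assuming an out-of-tree NCCL build and a standard CUDA toolkit install (both paths below are illustrative), the `NCCL_HOME` and `CUDA_HOME` variables consumed by the Makefile can be overridden directly on the make command line:

```bash
# Illustrative paths; substitute your own NCCL build tree and CUDA toolkit location
make NCCL_HOME=/opt/nccl/build CUDA_HOME=/usr/local/cuda
```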
+ +### Build Options + +The Makefile supports several build options: + +- **DEBUG=1**: Enable debug build with additional debugging information +- **ASAN=1**: Enable Address Sanitizer for memory error detection +- **UBSAN=1**: Enable Undefined Behavior Sanitizer + +Example debug build: +```bash +make DEBUG=1 +``` + +### Build Output + +The build process creates: +- `libnccl-profiler-inspector.so`: The main inspector plugin library +- `version.cc`: Auto-generated version information from git + +## Using NCCL Inspector + +### Key Differences from Normal NCCL Usage + +The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging: + +**Normal NCCL Run:** +```bash +# Standard NCCL execution +./your_nccl_application +``` + +**NCCL Inspector Run:** +```bash +# NCCL Inspector enabled execution +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +./your_nccl_application +``` + +### Required Environment Variables + +- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so` + Loads the Inspector plugin into NCCL. +- `NCCL_INSPECTOR_ENABLE=1` + Enables the Inspector plugin. +- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=` + Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`. +- `NCCL_INSPECTOR_DUMP_DIR=` (optional) + Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-` if running under SLURM. +- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional) + Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default). 
+ +### Example Usage + +**Single Node:** +```bash +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8 +``` + +**Multi-Node (SLURM):** +```bash +# Add these environment variables to your SLURM script +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/ + +# Then run your normal NCCL application +srun your_nccl_application +``` + +## Example Scripts + +For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory: + +- **Single Node Example**: Basic NCCL performance testing with inspector +- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations +- **Training Workload Example**: Integration with distributed training workloads + +## Output Example + +Each output file contains JSON objects with the following structure: + +```json +{ + "header": { + "id": "0x7f8c496ae9f661", + "rank": 2, + "n_ranks": 8, + "nnodes": 1 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "", + "rec_mechanism": "profiler_plugin", + "dump_timestamp_us": 1748030377748202, + "hostname": "example-hostname", + "pid": 1639453 + }, + "coll_perf": { + "coll": "AllReduce", + "coll_sn": 1407, + "coll_msg_size_bytes": 17179869184, + "coll_exec_time_us": 61974, + "coll_algobw_gbs": 277.210914, + "coll_busbw_gbs": 485.119099 + } +} +``` + +## Output Example Verbose + +To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable: + +```bash +export NCCL_INSPECTOR_DUMP_VERBOSE=1 +``` + +This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event. + +```json +{ + "header": { + "id": "0xe62dedaa97644a", + "rank": 4, + "n_ranks": 8, + "nnodes": 1 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "9019a1912-dirty", + "rec_mechanism": "nccl_profiler_interface", + "dump_timestamp_us": 1752867229276385, + "hostname": "example-hostname", + "pid": 438776 + }, + "coll_perf": { + "coll": "ReduceScatter", + "coll_sn": 1231, + "coll_msg_size_bytes": 2147483648, + "coll_exec_time_us": 41057, + "coll_timing_source": "kernel_gpu", + "coll_algobw_gbs": 418.439467, + "coll_busbw_gbs": 366.134533, + "event_trace_sn": { + "coll_start_sn": 1, + "coll_stop_sn": 2, + "kernel_events": [ + { + "channel_id": 0, + "kernel_start_sn": 3, + "kernel_stop_sn": 48, + "kernel_record_sn": 47 + } + ] + }, + "event_trace_ts": { + "coll_start_ts": 1752867229235059, + "coll_stop_ts": 1752867229235064, + "kernel_events": [ + { + "channel_id": 0, + "kernel_start_ts": 1752867229235181, + "kernel_stop_ts": 1752867229275811, + "kernel_record_ts": 1752867229275811 + } + ] + } + } +} +``` + +Multiple such JSON objects are written, one per collective operation per communicator. + +## Output Directory + +- By default, output files are written to: + - `nccl-inspector-unknown-jobid` (if no SLURM job ID is present) + - `nccl-inspector-` (if running under SLURM) +- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable. 
+ +## Additional Notes + +- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments. +- For more details, see the source code and comments in `ext-profiler/inspector/`. + diff --git a/ext-profiler/inspector/exporter/example/README.md b/ext-profiler/inspector/exporter/example/README.md new file mode 100644 index 000000000..26e4b2e57 --- /dev/null +++ b/ext-profiler/inspector/exporter/example/README.md @@ -0,0 +1,151 @@ +# NCCL Inspector Performance Summary Exporter + +This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries. +One can build similar exporters to integrate with various observability systems like Elastic, Prometheus or other Custom Metric systems. + +## Features + +- **Performance Analysis**: Generates statistical summaries for collective operations +- **Communication Type Classification**: Automatically categorizes communication patterns +- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics +- **Data Export**: Converts logs to Parquet format for efficient processing +- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files +- **Parallel Processing**: Utilizes multi-core processing for faster analysis + +## Requirements + +- Python 3.7+ +- Access to NCCL Inspector log files + +## Installation + +### Clone the Repository + +```bash +git clone https://github.com/NVIDIA/nccl.git +cd nccl/ext-profiler/inspector/exporter/example +``` + +Install the required dependencies using the provided `requirements.txt` file: + +```bash +pip install -r requirements.txt +``` + +## Usage + +The script processes NCCL Inspector log files from a specified directory. + +**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files will be output to a directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../../README.md). + +### Basic Usage + +```bash +python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs +``` + +This mode processes all log files in the specified directory and its subdirectories recursively. + +### Command Line Arguments + +- `--input_dir `: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories) +- `--output_dir `: **Optional**. Custom output directory name (default: `-analysis`) + +## Output + +The tool generates: + +1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory) +2. **Summary Directory**: Contains comprehensive analysis results +3. **Visualizations**: Scatter plots, histograms, and box plots for each message size +4. **CSV Files**: Detailed summaries for each message size and collective type +5. **Log File**: Processing log with detailed information + +## Example Output Structure + +``` +/ +├── output.log +├── parquet_files/ +│ ├── .parquet +│ ├── .parquet +│ └── ... 
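As a rough sketch of that idea (not part of this tool), the standalone script below walks a directory of inspector `.log`/`.log.gz`/`.jsonl`/`.jsonl.gz` files and emits one Prometheus-style text line per collective. Only the `header` and `coll_perf` field names come from the inspector output format documented in this README; the metric name, label choices, and default paths are invented for illustration.

```python
#!/usr/bin/env python3
"""Minimal custom-exporter sketch: inspector JSON lines -> Prometheus-style text samples.

Only header.* and coll_perf.* field names come from the inspector output format;
the metric/label names below are illustrative.
"""
import glob
import gzip
import json
import sys


def open_log(path):
    # Inspector logs may be plain or gzip-compressed JSON lines.
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "r")


def export(input_dir):
    patterns = ("*.log", "*.log.gz", "*.jsonl", "*.jsonl.gz")
    for pattern in patterns:
        for path in glob.iglob(f"{input_dir}/**/{pattern}", recursive=True):
            with open_log(path) as f:
                for line in f:
                    try:
                        rec = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # skip partially written lines
                    header = rec.get("header", {})
                    perf = rec.get("coll_perf", {})
                    if not header or not perf:
                        continue
                    # One gauge sample per collective operation per communicator.
                    print(
                        'nccl_coll_busbw_gbs{comm="%s",rank="%s",coll="%s",sn="%s",msg_bytes="%s"} %s'
                        % (
                            header.get("id"),
                            header.get("rank"),
                            perf.get("coll"),
                            perf.get("coll_sn"),
                            perf.get("coll_msg_size_bytes"),
                            perf.get("coll_busbw_gbs"),
                        )
                    )


if __name__ == "__main__":
    export(sys.argv[1] if len(sys.argv) > 1 else ".")
```

A real exporter would push these samples to the target observability system instead of printing them, but the parsing loop over the JSON records stays the same.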
+└── summary/ + ├── scatter_plot__.png + ├── combined_scatter_plot__.png + └── msg_size_/ + ├── histograms/ + │ └── histogram___.png + ├── boxplots/ + │ └── boxplot___.png + └── summary___.csv +``` + +## Supported Communicator Types + +- `single-rank` +- `nvlink-only` +- `hca-only` +- `mixed` + +## Supported Collective Types + +- `AllReduce` +- `AllGather` +- `ReduceScatter` +- `Broadcast` + +## Log File Formats + +### Supported Formats + +- `.log` - Plain text JSON lines +- `.log.gz` - Compressed JSON lines +- `.jsonl` - JSON lines format +- `.jsonl.gz` - Compressed JSON lines + +### Expected JSON Structure + +```json +{ + "header": { + "id": "0x9e7a479f95a66c", + "rank": 31, + "n_ranks": 32, + "nnodes": 4 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "75e61acda-dirty", + "rec_mechanism": "nccl_profiler_interface", + "dump_timestamp_us": 1749490229087081, + "hostname": "example-hostname", + "pid": 468528 + }, + "coll_perf": { + "coll": "ReduceScatter", + "coll_sn": 129, + "coll_msg_size_bytes": 65536, + "coll_exec_time_us": 110, + "coll_timing_source": "kernel_gpu", + "coll_algobw_gbs": 19.065018, + "coll_busbw_gbs": 18.469236 + } +} +``` + +## Troubleshooting + +### Common Issues + +1. **No log files found**: Ensure the log directory path is correct and contains valid log files +2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment +3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings. + +### Log Files + +The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages. + +## Support + +Please refer to the github issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and refer to the "inspector plugin" in the title. 
diff --git a/ext-profiler/inspector/exporter/example/perf_summary_exporter.py b/ext-profiler/inspector/exporter/example/perf_summary_exporter.py new file mode 100644 index 000000000..5913152ce --- /dev/null +++ b/ext-profiler/inspector/exporter/example/perf_summary_exporter.py @@ -0,0 +1,548 @@ +from pathlib import Path +import argparse +import glob +import gzip +import sys +import pandas as pd +from concurrent.futures import ProcessPoolExecutor +import json +from tqdm.auto import tqdm +import duckdb +import math +import matplotlib.pyplot as plt +import matplotlib.dates +from matplotlib.gridspec import GridSpec +import os +import logging +import contextlib +from datetime import datetime +import numpy as np + +def setup_logging(output_dir): + log_file = output_dir / "output.log" + logging.basicConfig( + filename=log_file, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + ) + + +@contextlib.contextmanager +def smart_open(filename, mode="r"): + if filename.endswith(".gz"): + opener = gzip.open + else: + opener = open + + with opener(filename, mode) as f: + yield f + + +def get_log_files_and_output_dir(): + parser = argparse.ArgumentParser(description="Process log files in a directory.") + parser.add_argument( + "--input_dir", + type=str, + help="The directory containing NCCL Inspector log files to process.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Custom output directory name (default: auto-generated from input directory)." + ) + args = parser.parse_args() + + if args.input_dir: + # Use the provided input directory + root_dir = Path(args.input_dir) + if not root_dir.exists(): + raise FileNotFoundError(f"Input directory not found: {root_dir}") + + logfiles = list(glob.iglob(str(Path(root_dir) / "**" / "*.log"), recursive=True)) + gzlogfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.log.gz"), recursive=True) + ) + jsonlfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.jsonl"), recursive=True) + ) + gzjsonlfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.jsonl.gz"), recursive=True) + ) + if ( + sum((1 for x in [logfiles, gzlogfiles, jsonlfiles, gzjsonlfiles] if len(x) > 0)) + > 1 + ): + ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail + logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!") + sys.exit(1) + + files = logfiles + gzlogfiles + jsonlfiles + gzjsonlfiles + + if not files: + print("No inspector logs found") + sys.exit(1) + + # Generate output directory name from input directory + if args.output_dir: + output_dir_name = args.output_dir + else: + output_dir_name = f"{root_dir.name}-analysis" + + return files, output_dir_name + +def bytes_to_human_readable(size_bytes): + """ + Convert bytes to human-readable format using decimal (SI) units. 
+ + Uses powers of 1000 (decimal/SI standard): + - 1 KB = 1,000 bytes + - 1 MB = 1,000,000 bytes + - 1 GB = 1,000,000,000 bytes + + Not binary units (powers of 1024): + - Does NOT use KiB, MiB, GiB (1024-based) + + Args: + size_bytes: Number of bytes to convert + + Returns: + Human-readable string (e.g., "1.50MB", "2.34GB") + """ + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.log10(int(size_bytes)) / 3) + s = round(size_bytes * math.pow(10, -3 * i), 2) + return f"{s:.2f}{size_name[i]}" + +def timestamp_to_datetime(timestamp_us): + """Convert microsecond timestamp to datetime string""" + return datetime.fromtimestamp(timestamp_us / 1000000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + +def microseconds_to_human_readable(microseconds): + """Convert microseconds to human readable format""" + if microseconds < 1000: + return f"{microseconds:.1f}μs" + elif microseconds < 1000000: + return f"{microseconds/1000:.1f}ms" + else: + return f"{microseconds/1000000:.1f}s" + +def get_comm_type(row) -> str: + if row["n_ranks"] == 1: + return "single-rank" + elif row["nnodes"] == 1: + return "nvlink-only" + elif row["n_ranks"] == row["nnodes"]: + return "hca-only" + else: + return "mixed" + +def parse_file(filepath: Path, output_dir): + filename = Path(filepath).stem + parquet_file = output_dir / f"{filename}.parquet" + + # Check if parquet file exists and is newer than source file + if parquet_file.exists(): + source_mtime = Path(filepath).stat().st_mtime + parquet_mtime = parquet_file.stat().st_mtime + if parquet_mtime >= source_mtime: + logging.info(f"Parquet file {parquet_file} is up to date. Skipping...") + return + else: + logging.info(f"Source file {filepath} is newer than parquet. Regenerating...") + + # Check if file is empty or too small + file_size = Path(filepath).stat().st_size + if file_size == 0: + logging.warning(f"Skipping empty file: {filepath}") + return + + recs = [] + try: + with smart_open(filepath, "r") as infile: + for lineno, line in enumerate(infile): + try: + json_recs = json.loads(line) + except json.JSONDecodeError: + logging.error(f"Failed to parse line {filepath}:{lineno}") + continue + + # Validate that required fields exist + if not all(key in json_recs for key in ["header", "metadata", "coll_perf"]): + logging.error(f"Missing required fields in {filepath}:{lineno}") + continue + + header = json_recs["header"] + metadata = json_recs["metadata"] + comm_type = get_comm_type(header) + coll_perf = json_recs["coll_perf"] + recs.append( + dict( + **header, + comm_type=comm_type, + **coll_perf, + **metadata, + ) + ) + except Exception as e: + logging.error(f"Error reading file {filepath}: {e}") + return + + # Skip files with no valid records + if not recs: + logging.warning(f"No valid records found in file: {filepath}. 
Skipping...") + return + + df = pd.DataFrame(recs) + df.to_parquet(parquet_file) + logging.info(f"Created parquet file {parquet_file} with {len(recs)} records") + +def create_per_node_parquet_files(files, output_dir): + output_dir = Path(output_dir) / "parquet_files" + output_dir.mkdir(parents=True, exist_ok=True) + max_workers = min(64, len(files), os.cpu_count() or 1) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + list( + tqdm( + executor.map(parse_file, files, [output_dir] * len(files)), + total=len(files), + desc="Processing files", + unit="file", + ) + ) + return output_dir + +def generate_scatter_plot(df, comm_type, coll_type, output_file): + plt.figure(figsize=(10, 6), dpi=100) + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + + for msg_size in distinct_msg_sizes: + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean() + plt.scatter( + df_msg_size["coll_sn"], + df_msg_size["mean_coll_busbw_gbs"], + label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)", + alpha=0.5, + ) + + plt.xlabel("Operation Sequence Number") + plt.ylabel("Mean Collective Bus BW (GB/s)") + plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}") + plt.legend(title="Message Size", loc="upper right") + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Scatter plot saved to {output_file}") + +def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3): + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + num_plots = len(distinct_msg_sizes) + + # Compute number of rows and columns + num_cols = min(max_cols, num_plots) # Limit max columns + num_rows = (num_plots + num_cols - 1) // num_cols # Calculate rows dynamically + + # Create figure with GridSpec + fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100) + gs = GridSpec(num_rows, num_cols, figure=fig) + + for i, msg_size in enumerate(distinct_msg_sizes): + row, col = divmod(i, num_cols) # Determine row & column index + ax = fig.add_subplot(gs[row, col]) # Create subplot at position + + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean() + ax.scatter( + df_msg_size["coll_sn"], + df_msg_size["mean_coll_busbw_gbs"], + label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)", + alpha=0.5, + ) + ax.set_xlabel("Op Seq No") + ax.set_ylabel("Mean Collective Bus BW (GB/s)") + ax.set_title(f"Message Size: {bytes_to_human_readable(msg_size)}({msg_size})") + ax.legend(loc="upper right") + + fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98) + + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Combined scatter plot saved to {output_file}") + +def generate_histogram(df, comm_type, coll_type, output_file, message_size): + plt.figure(figsize=(10, 6), dpi=100) + data_range = df["mean_coll_busbw_gbs"].max() - df["mean_coll_busbw_gbs"].min() + num_bins = min(50, int(data_range) + 1) + plt.hist( + df["mean_coll_busbw_gbs"], + bins=num_bins, + alpha=0.7, + color="b", + edgecolor="black", + linewidth=1.2, + ) + plt.xlabel("Mean Collective Bus BW (GB/s)") + plt.ylabel("Frequency") + plt.title( + f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}" + ) + plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}")) + plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, 
_: f"{x:.2f} GB/s")) + plt.gca().xaxis.get_offset_text().set_visible(False) + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Histogram saved to {output_file}") + +def generate_boxplot(df, comm_type, coll_type, output_file, message_size): + plt.figure(figsize=(10, 6)) + boxprops = dict(linestyle="-", linewidth=2, color="blue") + flierprops = dict(marker="o", color="red", alpha=0.5) + medianprops = dict(linestyle="-", linewidth=2.5, color="orange") + whiskerprops = dict(linestyle="--", linewidth=2, color="green") + capprops = dict(linestyle="-", linewidth=2, color="black") + + plt.boxplot( + df["mean_coll_busbw_gbs"], + vert=False, + patch_artist=True, + boxprops=boxprops, + flierprops=flierprops, + medianprops=medianprops, + whiskerprops=whiskerprops, + capprops=capprops, + ) + + plt.xlabel("Mean Coll Bus BW (GB/s)") + plt.title( + f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})" + ) + + # Adding labels for min, max, and median + stats = df["mean_coll_busbw_gbs"].describe(percentiles=[0.5]) + plt.annotate( + f"Min: {stats['min']:.2f}", + xy=(stats["min"], 1), + xytext=(stats["min"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + plt.annotate( + f"Median: {stats['50%']:.2f}", + xy=(stats["50%"], 1), + xytext=(stats["50%"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + plt.annotate( + f"Max: {stats['max']:.2f}", + xy=(stats["max"], 1), + xytext=(stats["max"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Box plot saved to {output_file}") + + +def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name): + """Summarize parquet data per communication and collective type using DuckDB""" + logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}") + + # Check if there are any parquet files + parquet_dir = output_root / "parquet_files" + parquet_files = list(parquet_dir.glob("*.parquet")) + if not parquet_files: + logging.warning(f"No parquet files found for {comm_type} and {coll_type}") + return None + + # Clean up invalid/empty parquet files by moving them to a separate directory + invalid_dir = parquet_dir / "invalid" + invalid_dir.mkdir(exist_ok=True) + + invalid_count = 0 + for pf in parquet_files: + try: + # Check file size first + if pf.stat().st_size == 0: + logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + continue + + # Use pyarrow to check parquet metadata without reading data + import pyarrow.parquet as pq + parquet_file = pq.ParquetFile(pf) + if parquet_file.metadata.num_rows == 0: + logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + except Exception as e: + logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + + # Check if any valid files remain + remaining_files = list(parquet_dir.glob("*.parquet")) + if not remaining_files: + logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)") + return None + + logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)") + + try: + duckdb.execute( + f"CREATE OR REPLACE VIEW logs AS SELECT * FROM 
read_parquet('{parquet_dir}/*.parquet')" + ) + df = duckdb.execute(f""" + SELECT + id, + coll_sn, + coll_msg_size_bytes, + AVG(coll_busbw_gbs) as mean_coll_busbw_gbs, + COUNT(*) as log_count, + ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks, + ARRAY_DISTINCT(LIST(nnodes)) as nnodes, + MIN(dump_timestamp_us) as coll_start_timestamp_us, + MAX(dump_timestamp_us) as coll_end_timestamp_us, + (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us + FROM logs + WHERE coll = '{coll_type}' and comm_type = '{comm_type}' + GROUP BY id, coll_sn, coll_msg_size_bytes + ORDER BY coll_sn + """).df() + except Exception as e: + logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}") + return None + + if df.empty: + logging.info(f"No data for {comm_type} and {coll_type}") + return None + + # Add human-readable formatting + df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply( + bytes_to_human_readable + ) + + # Log example of time range data for first few rows + if len(df) > 0: + sample_row = df.iloc[0] + start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us']) + end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us']) + duration = microseconds_to_human_readable(sample_row['coll_duration_us']) + logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, " + f"Start: {start_time}, End: {end_time}, Duration: {duration}") + + return df + + +def generate_visualizations(df, output_root, comm_type, coll_type): + """Generate all visualizations and save CSV files for the processed data""" + logging.info(f"Generating visualizations for {comm_type} and {coll_type}") + + summary_dir = output_root / "summary" + summary_dir.mkdir(parents=True, exist_ok=True) + + # Scatter Plot for all message sizes + output_file = summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png" + generate_scatter_plot(df, comm_type, coll_type, output_file) + + # Combined Scatter Plot for all message sizes + output_file = summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png" + generate_combined_scatter_plot(df, comm_type, coll_type, output_file) + + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + for msg_size in distinct_msg_sizes: + hr_msg_size = bytes_to_human_readable(msg_size) + msg_size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}" + msg_size_hist_dir = msg_size_dir / "histograms" + msg_size_boxplot_dir = msg_size_dir / "boxplots" + msg_size_dir.mkdir(parents=True, exist_ok=True) + msg_size_hist_dir.mkdir(parents=True, exist_ok=True) + msg_size_boxplot_dir.mkdir(parents=True, exist_ok=True) + + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + + # Add human-readable time formatting + df_msg_size = df_msg_size.copy() + df_msg_size["coll_start_datetime"] = df_msg_size["coll_start_timestamp_us"].apply(timestamp_to_datetime) + df_msg_size["coll_end_datetime"] = df_msg_size["coll_end_timestamp_us"].apply(timestamp_to_datetime) + df_msg_size["coll_duration_human"] = df_msg_size["coll_duration_us"].apply(microseconds_to_human_readable) + + # Histogram + output_file = ( + msg_size_hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png" + ) + generate_histogram( + df_msg_size, + comm_type, + coll_type, + output_file, + bytes_to_human_readable(msg_size), + ) + + # Box Plot + output_file = ( + msg_size_boxplot_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png" + ) + generate_boxplot( + df_msg_size, + comm_type, + coll_type, + output_file, + bytes_to_human_readable(msg_size), + 
) + + output_file = msg_size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv" + df_msg_size.to_csv(output_file, index=False) + logging.info( + f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {output_file}" + ) + + +def generate_summary(output_root, comm_type, coll_type, output_dir_name): + """Generate summary by summarizing data per comm/coll type and creating visualizations""" + logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}") + + # Step 1: Summarize data per communication and collective type + df = summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name) + + # Step 2: Generate visualizations if data exists + if df is not None: + generate_visualizations(df, output_root, comm_type, coll_type) + else: + logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation") + + +def generate_summary_wrapper(args): + return generate_summary(*args) + + +if __name__ == "__main__": + files, output_dir_name = get_log_files_and_output_dir() + print(f"Number of log files found: {len(files)}") + print(f"Output directory: {output_dir_name}") + output_dir = Path(output_dir_name) + output_dir.mkdir(parents=True, exist_ok=True) + setup_logging(output_dir) + create_per_node_parquet_files(files, output_dir) + comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"] + coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"] + summary_args = [ + (output_dir, comm_type, coll_type, output_dir_name) + for comm_type in comm_types + for coll_type in coll_types + ] + max_workers = min(64, len(summary_args), os.cpu_count() or 1) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + list( + tqdm( + executor.map(generate_summary_wrapper, summary_args), + total=len(summary_args), + desc="Generating summaries", + ) + ) + print("Done!") diff --git a/ext-profiler/inspector/exporter/example/requirements.txt b/ext-profiler/inspector/exporter/example/requirements.txt new file mode 100644 index 000000000..8a47aae51 --- /dev/null +++ b/ext-profiler/inspector/exporter/example/requirements.txt @@ -0,0 +1,6 @@ +pandas>=1.3.0 +tqdm>=4.60.0 +duckdb>=0.8.0 +matplotlib>=3.3.0 +pyarrow>=5.0.0 +numpy>=1.21.0 diff --git a/ext-profiler/inspector/inspector.cc b/ext-profiler/inspector/inspector.cc new file mode 100644 index 000000000..0cb9371d5 --- /dev/null +++ b/ext-profiler/inspector/inspector.cc @@ -0,0 +1,1530 @@ +#include "inspector.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define JSON_CHK(expr) \ + do { \ + const jsonResult_t res = (expr); \ + if (res != jsonSuccess) { \ + INFO(NCCL_INSPECTOR, "jsonError: %s\n", jsonErrorString(res)); \ + return inspectorJsonError; \ + } \ + } while (0) + +#define INS_CHK(call) \ + do { \ + inspectorResult_t res = call; \ + if (inspectorSuccess != res) { \ + INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \ + inspectorErrorString(res)); \ + return res; \ + } \ + } while (0); + +#define JSON_CHK_GOTO(expr, res, label) \ + do { \ + const jsonResult_t macro_res = (expr); \ + if (macro_res != jsonSuccess) { \ + INFO(NCCL_INSPECTOR, "jsonError: %s\n", jsonErrorString(macro_res)); \ + res = inspectorJsonError; \ + goto label; \ + } \ + } while (0) + +#define INS_CUDA_CHK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + INFO(NCCL_INSPECTOR, "Cuda failure '%s'", cudaGetErrorString(err)); \ + return 
inspectorCudaError; \ + } \ + } while (false) + + +// Global flag to control inspector use +static bool enableNcclInspector = false; +// Global flag to control starting internal dump thread +static bool enableNcclInspectorDumpThread = false; +// Global flag to control verbose dumping (event_trace) +static bool enableNcclInspectorDumpVerbose = false; +// Extra guard to prevent spurious messages for eager pollers that try to dump +// out results before we have initialized +static bool ncclInspectorInit = false; + +// Define the global logFn variable +ncclDebugLogger_t logFn = nullptr; + +/* + * Description: + * + * Returns the current time in microseconds since the epoch. + * + * Thread Safety: + * + * Thread-safe (uses gettimeofday). + * + * Input: + * + * None. + * + * Output: + * + * None. + * + * Return: + * uint64_t - current time in microseconds. + * + * Error Handling: + * This function uses gettimeofday() which rarely fails. In case of + * failure, the function returns 0. Callers should check for 0 return + * value if precise error handling is required. + * + */ +uint64_t inspectorGetTime() { + uint64_t ts = 0; + timeval tv; + + gettimeofday(&tv, 0); + ts = tv.tv_sec * 1000000 + tv.tv_usec; + return ts; +} + +/* + * Description: + * + * Converts a string to the corresponding ncclDataType_t enum value. + * + * Thread Safety: + * Thread-safe (read-only string input). + * + * Input: + * + * const char* str - string representation of the datatype. + * + * Output: + * + * None. + * + * Return: + * + * ncclDataType_t - corresponding enum value, or -1 if unknown. + * + */ +ncclDataType_t inspectorStringToDatatype(const char* str) { + if (strcmp(str, "ncclInt8") == 0) return ncclInt8; + if (strcmp(str, "ncclInt32") == 0) return ncclInt32; + if (strcmp(str, "ncclUint32") == 0) return ncclUint32; + if (strcmp(str, "ncclInt64") == 0) return ncclInt64; + if (strcmp(str, "ncclUint64") == 0) return ncclUint64; + if (strcmp(str, "ncclFloat16") == 0) return ncclFloat16; + if (strcmp(str, "ncclFloat32") == 0) return ncclFloat32; + if (strcmp(str, "ncclFloat64") == 0) return ncclFloat64; + if (strcmp(str, "ncclBfloat16") == 0) return ncclBfloat16; + if (strcmp(str, "ncclFloat8e4m3") == 0) return ncclFloat8e4m3; + if (strcmp(str, "ncclFloat8e5m2") == 0) return ncclFloat8e5m2; + return (ncclDataType_t)-1; // Or handle error as appropriate +} + +/* + * Description: + * + * Converts a string to the corresponding ncclFunc_t enum value. + * + * Thread Safety: + * Thread-safe (read-only string input). + * + * Input: + * const char* str - string representation of the function (must not be NULL). + * + * Output: + * None. + * + * Return: + * ncclFunc_t - corresponding enum value, or ncclNumFuncs if unknown. 
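+ *
+ * Example (illustrative):
+ *   ncclStringToFunc("AllReduce") -> ncclFuncAllReduce
+ *   ncclStringToFunc("bogus")     -> ncclNumFuncs (unknown string)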
+ * + * Preconditions: + * - str must not be NULL + */ +ncclFunc_t ncclStringToFunc(const char* str) { + if (strcmp(str, "AllGather") == 0) return ncclFuncAllGather; + if (strcmp(str, "AllReduce") == 0) return ncclFuncAllReduce; + if (strcmp(str, "Broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "Recv") == 0) return ncclFuncRecv; + if (strcmp(str, "Reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "ReduceScatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "SendRecv") == 0) return ncclFuncSendRecv; + if (strcmp(str, "Send") == 0) return ncclFuncSend; + return ncclNumFuncs; // Invalid / unknown +} + +const char* ncclFuncToString(ncclFunc_t fn) { + switch (fn) { + case ncclFuncAllGather: return "AllGather"; + case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncRecv: return "Recv"; + case ncclFuncReduce: return "Reduce"; + case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncSendRecv: return "SendRecv"; + case ncclFuncSend: return "Send"; + default: return "Invalid"; + } +} + +struct inspectorDumpThread; +static inspectorDumpThread* dumper = nullptr; + +#define UNUSED(x) (void)(x) + +inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_init(lockRef, nullptr)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_destroy(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_rdlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_wrlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_unlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +// TODO inspect these retvals +#define INSPECTOR_LOCK_RD_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (!lockFlag) { \ + INS_CHK(inspectorLockRd(lockRef)); \ + } \ + lockFlag = true; \ + } while (0); + +#define INSPECTOR_LOCK_WR_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (!lockFlag) { \ + INS_CHK(inspectorLockWr(lockRef)); \ + } \ + lockFlag = true; \ + } while (0); + +#define INSPECTOR_UNLOCK_RW_LOCK_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (lockFlag) { \ + INS_CHK(inspectorUnlockRWLock(lockRef)); \ + } \ + lockFlag = false; \ + } while (0); + +struct inspectorCommInfoList { + struct inspectorCommInfo* comms; + uint32_t ncomms; + pthread_rwlock_t guard; +}; + +struct inspectorState { + struct inspectorCommInfoList liveComms; + struct inspectorCommInfoList deletedComms; +}; + + +static inspectorState g_state; + +static inspectorResult_t inspectorCommInfoListInit(struct inspectorCommInfoList* commList) { + if (commList->comms) { + return inspectorGlobalInitError; + } + commList->comms = nullptr; + commList->ncomms = 0; + INS_CHK(inspectorLockInit(&commList->guard)); + return inspectorSuccess; +} + +static inspectorResult_t inspectorGlobalStateInit() { + memset(&g_state, 0, sizeof(struct inspectorState)); + INS_CHK(inspectorCommInfoListInit(&g_state.liveComms)); + INS_CHK(inspectorCommInfoListInit(&g_state.deletedComms)); + return inspectorSuccess; +} + +/* + * Description: + * + * 
Converts inspectorTimingSource_t enum to a string representation. + * + * Thread Safety: + * Thread-safe (read-only operation). + * + * Input: + * inspectorTimingSource_t timingSource - timing source enum value. + * + * Output: + * None. + * + * Return: + * const char* - string representation of the timing source. + */ +static const char* inspectorTimingSourceToString(inspectorTimingSource_t timingSource) { + switch (timingSource) { + case inspectorTimingSourceKernelGpu: + return "kernel_gpu"; + case inspectorTimingSourceKernelCpu: + return "kernel_cpu"; + case inspectorTimingSourceCollectiveCpu: + return "collective_cpu"; + default: + return "unknown"; + } +} + +/* + * Description: + * + * Writes the header information for a communicator to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * struct inspectorCommInfo* commInfo - communicator info. + * + * Output: + * Header is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoHeader(jsonFileOutput* jfo, + struct inspectorCommInfo* commInfo) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "id")); JSON_CHK(jsonStr(jfo, commInfo->commHashStr)); + JSON_CHK(jsonKey(jfo, "rank")); JSON_CHK(jsonInt(jfo, commInfo->rank)); + JSON_CHK(jsonKey(jfo, "n_ranks")); JSON_CHK(jsonInt(jfo, commInfo->nranks)); + JSON_CHK(jsonKey(jfo, "nnodes")); JSON_CHK(jsonUint64(jfo, commInfo->nnodes)); + JSON_CHK(jsonFinishObject(jfo)); + return inspectorSuccess; +} + +/* + * Description: + * + * Writes metadata header information to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * + * Output: + * Metadata header is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoMetaHeader(jsonFileOutput* jfo) { + JSON_CHK(jsonStartObject(jfo)); + { + JSON_CHK(jsonKey(jfo, "inspector_output_format_version")); JSON_CHK(jsonStr(jfo, "v4.0")); + JSON_CHK(jsonKey(jfo, "git_rev")); JSON_CHK(jsonStr(jfo, get_git_version_info())); + JSON_CHK(jsonKey(jfo, "rec_mechanism")); JSON_CHK(jsonStr(jfo, "nccl_profiler_interface")); + JSON_CHK(jsonKey(jfo, "dump_timestamp_us")); JSON_CHK(jsonUint64(jfo, inspectorGetTime())); + char hostname[256]; + gethostname(hostname, 255); + JSON_CHK(jsonKey(jfo, "hostname")); JSON_CHK(jsonStr(jfo, hostname)); + JSON_CHK(jsonKey(jfo, "pid")); JSON_CHK(jsonUint64(jfo, getpid())); + } + JSON_CHK(jsonFinishObject(jfo)); + return inspectorSuccess; +} + +/* + * Description: + * + * Writes verbose information (event_trace) for a completed + * collective operation to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * const struct inspectorCompletedCollInfo* collInfo - completed + * collective info. + * + * Output: + * Verbose collective info is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
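+ *
+ * Output sketch (sequence numbers and channel count are illustrative):
+ *   "event_trace_sn": {
+ *     "coll_start_sn": 1, "coll_stop_sn": 4,
+ *     "kernel_events": [{"channel_id": 0, "kernel_start_sn": 2,
+ *                        "kernel_stop_sn": 3, "kernel_record_sn": 5}]}
+ *   followed by a matching "event_trace_ts" object holding timestamps.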
+ * + */ +static inline inspectorResult_t inspectorCompletedCollVerbose(jsonFileOutput* jfo, + struct inspectorCompletedCollInfo* collInfo) { + // Add event trace information + JSON_CHK(jsonKey(jfo, "event_trace_sn")); + JSON_CHK(jsonStartObject(jfo)); + { + // Collective events + JSON_CHK(jsonKey(jfo, "coll_start_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_START].sn)); + JSON_CHK(jsonKey(jfo, "coll_stop_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_STOP].sn)); + + // Kernel events + JSON_CHK(jsonKey(jfo, "kernel_events")); + JSON_CHK(jsonStartList(jfo)); + for (uint32_t ch = 0; ch < collInfo->collEvtTrk.nChannels; ch++) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "channel_id")); JSON_CHK(jsonInt(jfo, ch)); + JSON_CHK(jsonKey(jfo, "kernel_start_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_START].sn)); + JSON_CHK(jsonKey(jfo, "kernel_stop_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_STOP].sn)); + JSON_CHK(jsonKey(jfo, "kernel_record_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_RECORD].sn)); + JSON_CHK(jsonFinishObject(jfo)); + } + JSON_CHK(jsonFinishList(jfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + + JSON_CHK(jsonKey(jfo, "event_trace_ts")); + JSON_CHK(jsonStartObject(jfo)); + { + // Collective events + JSON_CHK(jsonKey(jfo, "coll_start_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_START].ts)); + JSON_CHK(jsonKey(jfo, "coll_stop_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_STOP].ts)); + + // Kernel events + JSON_CHK(jsonKey(jfo, "kernel_events")); + JSON_CHK(jsonStartList(jfo)); + for (uint32_t ch = 0; ch < collInfo->collEvtTrk.nChannels; ch++) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "channel_id")); JSON_CHK(jsonInt(jfo, ch)); + JSON_CHK(jsonKey(jfo, "kernel_start_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_START].ts)); + JSON_CHK(jsonKey(jfo, "kernel_stop_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_STOP].ts)); + JSON_CHK(jsonKey(jfo, "kernel_record_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_RECORD].ts)); + JSON_CHK(jsonFinishObject(jfo)); + } + JSON_CHK(jsonFinishList(jfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + + return inspectorSuccess; +} + +/* + * Description: + * + * Writes completed collective operation information to the JSON + * output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * const struct inspectorCompletedCollInfo* collInfo - completed + * collective info. + * + * Output: + * Collective info is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
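+ *
+ * Output sketch (values are illustrative, assuming an 8-rank AllReduce):
+ *   {"coll": "AllReduce", "coll_sn": 42, "coll_msg_size_bytes": 1048576,
+ *    "coll_exec_time_us": 120, "coll_timing_source": "kernel_gpu",
+ *    "coll_algobw_gbs": 8.74, "coll_busbw_gbs": 15.29}
+ *   where busbw = algobw * 2*(nranks-1)/nranks for AllReduce.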
+ * + */ +static inline inspectorResult_t inspectorCompletedColl(jsonFileOutput* jfo, + struct inspectorCompletedCollInfo* collInfo) { + JSON_CHK(jsonStartObject(jfo)); + { + + JSON_CHK(jsonKey(jfo, "coll")); JSON_CHK(jsonStr(jfo, ncclFuncToString(collInfo->func))); + + JSON_CHK(jsonKey(jfo, "coll_sn")); JSON_CHK(jsonUint64(jfo, collInfo->sn)); + + JSON_CHK(jsonKey(jfo, "coll_msg_size_bytes")); JSON_CHK(jsonUint64(jfo, collInfo->msgSizeBytes)); + + JSON_CHK(jsonKey(jfo, "coll_exec_time_us")); JSON_CHK(jsonUint64(jfo, collInfo->execTimeUsecs)); + + JSON_CHK(jsonKey(jfo, "coll_timing_source")); JSON_CHK(jsonStr(jfo, inspectorTimingSourceToString(collInfo->timingSource))); + + JSON_CHK(jsonKey(jfo, "coll_algobw_gbs")); JSON_CHK(jsonDouble(jfo, collInfo->algoBwGbs)); + + JSON_CHK(jsonKey(jfo, "coll_busbw_gbs")); JSON_CHK(jsonDouble(jfo, collInfo->busBwGbs)); + + if (enableNcclInspectorDumpVerbose) { + INS_CHK(inspectorCompletedCollVerbose(jfo, collInfo)); + } + } + JSON_CHK(jsonFinishObject(jfo)); + + return inspectorSuccess; +} + + +/* + * Description: + * + * Dumps the state of a communicator to the JSON output if needed. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * inspectorCommInfo* commInfo - communicator info. + * bool* needs_writing - set to true if output was written. + * + * Output: + * State is dumped to JSON output if needed. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoDump(jsonFileOutput* jfo, + inspectorCommInfo* commInfo, + bool* needs_writing) { + *needs_writing = false; + + if (commInfo == nullptr) + return inspectorSuccess; + + struct inspectorCompletedCollInfo collInfo; + memset(&collInfo, 0, sizeof(struct inspectorCompletedCollInfo)); + + inspectorLockWr(&commInfo->guard); + if (commInfo->dump) { + *needs_writing = true; + memcpy(&collInfo, + &commInfo->completedCollInfo, + sizeof(struct inspectorCompletedCollInfo)); + commInfo->dump = false; + } + inspectorUnlockRWLock(&commInfo->guard); + + if (*needs_writing) { + JSON_CHK(jsonLockOutput(jfo)); + JSON_CHK(jsonStartObject(jfo)); + { + JSON_CHK(jsonKey(jfo, "header")); + inspectorCommInfoHeader(jfo, commInfo); + + JSON_CHK(jsonKey(jfo, "metadata")); + inspectorCommInfoMetaHeader(jfo); + + JSON_CHK(jsonKey(jfo, "coll_perf")); + INS_CHK(inspectorCompletedColl(jfo, &collInfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + JSON_CHK(jsonNewline(jfo)); + JSON_CHK(jsonUnlockOutput(jfo)); + } + return inspectorSuccess; +} + + +/* + * Description: + * + * Dumps the state of all communicators in a commList to the JSON + * output. + * + * Thread Safety: + * Thread-safe - assumes no locks are taken and acquires all necessary + * locks to iterate through all communicator objects and dump their state. + * + * Input: + * jsonFileOutput* jfo - JSON output handle (must not be NULL). + * struct inspectorCommInfoList* commList - list of communicators (must not be NULL). + * + * Output: + * State of all communicators is dumped to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
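+ *
+ * Each communicator with a pending sample produces one JSON line of the
+ * form (fields abridged):
+ *   {"header": {...}, "metadata": {...}, "coll_perf": {...}}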
+ * + */ +static inspectorResult_t inspectorCommInfoListDump(jsonFileOutput* jfo, + struct inspectorCommInfoList* commList) { + bool flush = false; + INS_CHK(inspectorLockRd(&commList->guard)); + inspectorResult_t res = inspectorSuccess; + if (commList->ncomms > 0) { + for (struct inspectorCommInfo* itr = commList->comms; + itr != nullptr; + itr = itr->next) { + bool needs_writing; + INS_CHK_GOTO(inspectorCommInfoDump(jfo, itr, &needs_writing), res, finalize); + if (needs_writing) { + flush = true; + } + } + if (flush) { + JSON_CHK_GOTO(jsonLockOutput(jfo), res, finalize); + JSON_CHK_GOTO(jsonFlushOutput(jfo), res, finalize); + JSON_CHK_GOTO(jsonUnlockOutput(jfo), res, finalize); + } + } +finalize: + INS_CHK(inspectorUnlockRWLock(&commList->guard)); + return res; +} + +/* + * Description: + * Finalizes and cleans up a commList, freeing all communicators. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * struct commList* commList - list of communicators. + * + * Output: + * All communicators are freed. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoListFinalize(struct inspectorCommInfoList* commList) { + struct inspectorCommInfo* nextComm = nullptr; + INS_CHK(inspectorLockWr(&commList->guard)); + while (commList->comms != nullptr && commList->ncomms != 0) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: comm %lu still in tracker", + commList->comms->commHash); + nextComm = commList->comms->next; + INS_CHK(inspectorLockDestroy(&commList->comms->guard)); + free(commList->comms); + commList->comms = nextComm; + commList->ncomms--; + } + INS_CHK(inspectorUnlockRWLock(&commList->guard)); + return inspectorSuccess; +} + +/* + * Description: + * + * Ensures the given directory exists and is writable, creating it + * if necessary. + * + * Thread Safety: + * Not thread-safe (should be called during initialization). + * + * Input: + * char* workdir - directory path. + * + * Output: + * Directory is created if needed. + * + * Return: + * + * bool - true if directory exists and is writable, false otherwise. + * + */ +static bool ensureDir(char* workdir) { + struct stat st; + + // Check if directory exists + if (stat(workdir, &st) == 0) { + if (S_ISDIR(st.st_mode)) { + // Directory exists, check if it's writable + if (access(workdir, W_OK) == 0) { + return true; // Directory exists and is writable + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspectoer: dump directory %s exists, but is not " + "writable", + workdir); + return false; + } + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: dump location %s exists, but is not a " + "directory", + workdir); + return false; + } + } else { + // Directory doesn't exist, try to create it + const mode_t mode = 0777; + if (mkdir(workdir, mode) == 0) { + return true; // Directory created successfully + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: failed to create dump directory %s: %s", workdir, + strerror(errno)); + return false; + } + } +} + +/* + * Description: + * + * Generates the output dump directory path based on environment + * variables. + * + * Thread Safety: + * Not thread-safe (should be called during initialization). + * + * Input: + * char** workdir - pointer to output directory string. + * + * Output: + * workdir is set to the generated directory path. + * + * Return: + * None. 
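+ *
+ * Examples (illustrative values):
+ *   NCCL_INSPECTOR_DUMP_DIR=/tmp/inspector -> "/tmp/inspector"
+ *   SLURM_JOBID=12345, no dump dir set     -> "nccl-inspector-12345"
+ *   neither variable set                   -> "nccl-inspector-unknown-jobid"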
+ */ +static void genDumpDir(char** workdir) { + char* dumpdir = getenv("NCCL_INSPECTOR_DUMP_DIR"); + if (dumpdir != NULL) { + *workdir = strdup(dumpdir); + // TODO check errors here + return; + } + + char* jobid = getenv("SLURM_JOBID"); + bool badJobId = true; + if (jobid != NULL) { + errno = 0; + const int intid = strtol(jobid, NULL, 10); + if (errno == 0) { + char tmp[2048]; + snprintf(tmp, 2048, "nccl-inspector-%d", intid); + *workdir = strdup(tmp); + badJobId = false; + } + } + + if (badJobId) { + *workdir = strdup("nccl-inspector-unknown-jobid"); + } +} + +struct inspectorDumpThread { + bool run{false}; + jsonFileOutput* jfo; + char* outputRoot; + uint64_t sampleIntervalUsecs; + pthread_t pthread; + pthread_rwlock_t guard; + + inspectorDumpThread(const char* outputRoot, uint64_t sampleIntervalUsecs) + : jfo(nullptr), outputRoot(strdup(outputRoot)), sampleIntervalUsecs(sampleIntervalUsecs) { + if (inspectorLockInit(&guard) != inspectorSuccess) { + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: couldn't init lock"); + } + } + + ~inspectorDumpThread() { + if (jfo != nullptr) { + jsonFinalizeFileOutput(jfo); + jfo = nullptr; + } + if (outputRoot != nullptr) { + free(outputRoot); + outputRoot = nullptr; + } + if (inspectorLockDestroy(&guard) != inspectorSuccess) { + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: couldn't destroy lock"); + } + } + + void startThread() { + inspectorLockWr(&guard); + run = true; + inspectorUnlockRWLock(&guard); + if (pthread_create(&pthread, NULL, dumpMain, this) != 0) { + INFO(NCCL_INSPECTOR, + "NCCL Inspector inspectorDumpThread: couldn't create dump thread!"); + return; + } + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: created"); + } + + void stopThread() { + INFO(NCCL_ENV, "NCCL Inspector Stopping Dump thread"); + inspectorLockWr(&guard); + run = false; + inspectorUnlockRWLock(&guard); + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 1000000; // 1ms + nanosleep(&ts, NULL); + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: stopped"); + } + + inspectorResult_t inspectorStateDump(const char* output_root) { + if (!ncclInspectorInit) { + return inspectorUninitializedError; + } + if (!enableNcclInspector) { + INFO(NCCL_INSPECTOR, "NCCL Inspector is not enabled, will not do ncclAllCommTallyDump"); + return inspectorDisabledError; + } + + if (jfo == 0) { + char hostname[256]; + gethostname(hostname, 255); + char tmp[2048]; + snprintf(tmp, sizeof(tmp), "%s/%s-pid%d.log", output_root, hostname, getpid()); + jsonResult_t result = jsonInitFileOutput(&jfo, tmp); + if (jsonSuccess != result) { + INFO(NCCL_INSPECTOR, "Cannot open %s for writing: %s", tmp, jsonErrorString(result)); + return inspectorFileOpenError; + } + chmod(tmp, 0666); + } + + if (jfo != nullptr) { + inspectorCommInfoListDump(jfo, &g_state.liveComms); + inspectorCommInfoListDump(jfo, &g_state.deletedComms); + } + + if (g_state.deletedComms.ncomms > 0) { + inspectorCommInfoListFinalize(&g_state.deletedComms); + } + return inspectorSuccess; + } + + static void* dumpMain(void* arg) { + inspectorDumpThread* dumper = (inspectorDumpThread*)arg; + inspectorResult_t res = inspectorSuccess; + struct timespec ts; + ts.tv_sec = dumper->sampleIntervalUsecs / 1000000; + ts.tv_nsec = dumper->sampleIntervalUsecs % 1000000; + + while (dumper->run) { + inspectorLockWr(&dumper->guard); + if (!dumper->run) { + inspectorUnlockRWLock(&dumper->guard); + break; + } + res = dumper->inspectorStateDump(dumper->outputRoot); + if (res == inspectorFileOpenError || res == 
inspectorDisabledError) { + inspectorUnlockRWLock(&dumper->guard); + break; + } + inspectorUnlockRWLock(&dumper->guard); + + nanosleep(&ts, NULL); + } + + return 0; + } +}; + +/* + * Description: + * + * Shows the NCCL Inspector plugin version and configuration + * environment variables in a structured format similar to NCCL's + * showVersion function. + * + * Thread Safety: + * Thread-safe (read-only environment variable access). + * + * Input: + * None. + * + * Output: + * Logs version and environment variables to debug output. + * + * Return: + * None. + */ +static void showInspectorVersion() { + VERSION("NCCL Inspector Plugin - Version: %s", get_git_version_info()); +} + +/* + * Description: + * + * Shows all NCCL Inspector environment variables and their values + * in a structured format. + * + * Thread Safety: + * Thread-safe (read-only environment variable access). + * + * Input: + * None. + * + * Output: + * Logs environment variables to debug output. + * + * Return: + * None. + */ +static void showInspectorEnvVars() { + struct { + const char* name; + const char* value; + const char* defaultVal; + const char* description; + } envVars[] = { + {"NCCL_INSPECTOR_ENABLE", getenv("NCCL_INSPECTOR_ENABLE"), "0", "Enable/disable inspector plugin"}, + {"NCCL_INSPECTOR_DUMP_THREAD_ENABLE", getenv("NCCL_INSPECTOR_DUMP_THREAD_ENABLE"), "1", "Enable/disable dump thread"}, + {"NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS", getenv("NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS"), "0", "Dump thread interval in microseconds"}, + {"NCCL_INSPECTOR_DUMP_DIR", getenv("NCCL_INSPECTOR_DUMP_DIR"), "(auto-generated)", "Output directory for inspector logs"}, + {"NCCL_INSPECTOR_DUMP_VERBOSE", getenv("NCCL_INSPECTOR_DUMP_VERBOSE"), "0", "Enable/disable verbose dumping (event_trace)"} + }; + + const int numEnvVars = sizeof(envVars) / sizeof(envVars[0]); + + VERSION("NCCL Inspector Environment Variables:"); + for (int i = 0; i < numEnvVars; i++) { + VERSION(" %s = %s%s%s", + envVars[i].name, + envVars[i].value ? envVars[i].value : "(not set)", + envVars[i].value ? "" : ", default=", + envVars[i].value ? "" : envVars[i].defaultVal); + } +} + +/* + * Description: + * + * Initializes the global inspector state and starts the dump thread + * if enabled. + * + * Thread Safety: + * + * Not thread-safe (should be called during initialization). + * + * Input: + * None. + * + * Output: + * Global state is initialized and dump thread may be started. + * + * Return: + * inspectorResult_t - success or error code. + */ +inspectorResult_t inspectorGlobalInit(int rank) { + char* str = getenv("NCCL_INSPECTOR_ENABLE"); + int enable = str ? atoi(str) : 0; // default disable + enableNcclInspector = enable == 0 ? false : true; + ncclInspectorInit = true; + + // Show version and environment configuration (similar to NCCL's showVersion) + if (rank == 0) { + showInspectorVersion(); + showInspectorEnvVars(); + } + + if (enableNcclInspector == false) { + VERSION("NCCL Inspector Plugin DISABLED (NCCL_INSPECTOR_ENABLE=%s)", + str ? str : "0"); + return inspectorDisabledError; + } + + INS_CHK(inspectorGlobalStateInit()); + + str = getenv("NCCL_INSPECTOR_DUMP_THREAD_ENABLE"); + enable = str ? atoi(str) : 1; // default enable + enableNcclInspectorDumpThread = enable == 0 ? false : true; + + str = getenv("NCCL_INSPECTOR_DUMP_VERBOSE"); + enable = str ? atoi(str) : 0; // default disable + enableNcclInspectorDumpVerbose = enable == 0 ? 
false : true; + + if (enableNcclInspectorDumpThread) { + str = getenv("NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS"); + const uint64_t interval = str ? strtoull(str, 0, 0) : 0; + + if (interval == 0) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: dump thread enabled but " + "NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS is 0; not " + "starting internal dump " + "thread."); + return inspectorSuccess; + } + + char* dumpdir; + genDumpDir(&dumpdir); + + if (dumpdir != nullptr) { + if (!ensureDir(dumpdir)) { + free(dumpdir); + INFO(NCCL_INSPECTOR, "NCCL Inspector: failed to generate a dump dir; not " + "starting internal dump thread."); + return inspectorSuccess; + } + + dumper = new inspectorDumpThread(dumpdir, interval); + dumper->startThread(); + + INFO(NCCL_INSPECTOR, + "NCCL Inspector enabled with polling interval %lu us and " + "output directory %s", + interval, dumpdir); + free(dumpdir); + } else { + INFO(NCCL_INSPECTOR, "NCCL Inspector: failed to generate a dump " + "dir; not starting internal dump thread."); + } + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: NCCL_INSPECTOR_DUMP_THREAD_ENABLE set to 0; not " + "starting internal dump " + "thread."); + } + return inspectorSuccess; +} + +/* + * Description: + * + * Returns a string describing the given inspectorResult_t error + * code. + * + * Thread Safety: + * Thread-safe (read-only operation). + * + * Input: + * inspectorResult_t result - error code. + * + * Output: + * None. + * + * Return: + * const char* - error string. + */ +const char* inspectorErrorString(inspectorResult_t result) { + switch (result) { + case inspectorSuccess: + return "Success"; + case inspectorUninitializedError: + return "Inspector is not initialized"; + case inspectorMemoryError: + return "Inspector encountered issue allocating memory"; + case inspectorFileOpenError: + return "Inspector could not open file"; + case inspectorDisabledError: + return "Inspector is disabled"; + case inspectorLockError: + return "Inspector encountered error with lock"; + case inspectorPthreadError: + return "Inspector encountered error with pthreads"; + case inspectorJsonError: + return "Inspector encountered error while emitting JSON"; + case inspectorCudaError: + return "Inspector encountered CUDA error"; + case inspectorBadHash: + return "Inspector encountered bad communicator hash"; + case inspectorDeleteUnknownCommError: + return "Inspector was asked to delete a communicator that it is not " + "tracking"; + case inspectorAddDuplicateCommError: + return "Inspector was asked to add a communicator it was already " + "tracking"; + case inspectorNop: + return "Inspector NOP"; + case inspectorNullTally: + return "Inspector encountered a null OpTally"; + case inspectorGlobalInitError: + return "Inspector encountered a repeated global init"; + case inspectorReturn: + return "Inspector Unconditional Return"; + default: + return "Unknown error"; + } +} + +/* + * Description: + * Converts a communicator hash to a string. + * + * Thread Safety: + * Thread-safe (writes to provided buffer). + * + * Input: + * uint64_t commHash - communicator hash. + * char hashStr[NCCL_COMM_HASH_LENGTH] - output buffer. + * + * Output: + * hashStr is set to the string representation of commHash. + * + * Return: + * inspectorResult_t - success or error code. 
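+ *
+ * Example (illustrative value):
+ *   commHash 0x1a2b3c4d -> hashStr "0x1a2b3c4d"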
+ */ +inspectorResult_t inspectorCommGetHashStr(uint64_t commHash, + char hashStr[NCCL_COMM_HASH_LENGTH]) { + snprintf(hashStr, NCCL_COMM_HASH_LENGTH, "0x%lx", + commHash); + return inspectorSuccess; +} + +/* + * Description: + * Compares two communicator configurations for equality. + * + * Thread Safety: + * Thread-safe (read-only comparison). + * + * Input: + * uint64_t lCommHash - left communicator hash. + * uint64_t rCommHash - right communicator hash. + * int lRank - left rank. + * int rRank - right rank. + * + * Output: + * None. + * + * Return: + * bool - true if communicators are equal (same hash and rank), false otherwise. + */ +static bool comm_eq(uint64_t lCommHash, uint64_t rCommHash, + int lRank, int rRank) { + return lCommHash == rCommHash && lRank == rRank; +} + +/* + * Description: + * Initializes a communicator info structure with the provided parameters. + * + * Thread Safety: + * Not thread-safe - should be called during communicator initialization. + * + * Input: + * struct inspectorCommInfo* commInfo - communicator info structure to initialize (must not be NULL). + * const char* commName - communicator name (can be NULL). + * uint64_t commHash - communicator hash. + * int nnodes - number of nodes (must be > 0). + * int nranks - number of ranks (must be > 0). + * int rank - rank (must be >= 0 and < nranks). + * + * Output: + * commInfo is initialized with the provided parameters. + * + * Return: + * inspectorResult_t - success or error code. + * + * Preconditions: + * - commInfo must not be NULL + * - nnodes must be positive + * - nranks must be positive + * - rank must be non-negative and less than nranks + */ +static inspectorResult_t inspectorFillCommInfo(struct inspectorCommInfo* commInfo, + const char* commName, uint64_t commHash, + int nnodes, int nranks, int rank) { + commInfo->commName = commName; + commInfo->commHash = commHash; + inspectorCommGetHashStr(commHash, commInfo->commHashStr); + commInfo->rank = rank; + commInfo->nranks = nranks; + commInfo->nnodes = nnodes; + commInfo->dump = false; + INS_CHK(inspectorLockInit(&commInfo->guard)); + commInfo->next = nullptr; + return inspectorSuccess; +} + +/* + * Description: + * Adds a communicator to the global state. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo **commInfo - pointer to output struct (must not be NULL). + * const char* commName - communicator name (can be NULL). + * uint64_t commHash - communicator hash. + * int nNodes - number of nodes (must be > 0). + * int nranks - number of ranks (must be > 0). + * int rank - rank (must be >= 0 and < nranks). + * + * Output: + * commInfo is set to the new communicator struct. + * + * Return: + * inspectorResult_t - success or error code. 
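+ *
+ * Usage sketch (argument values are illustrative):
+ *   struct inspectorCommInfo* commInfo = nullptr;
+ *   INS_CHK(inspectorAddComm(&commInfo, commName, commHash,
+ *                            nNodes, nranks, rank));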
+ * + * Preconditions: + * - commInfo must not be NULL + * - nNodes must be positive + * - nranks must be positive + * - rank must be non-negative and less than nranks + */ +inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo, + const char* commName, uint64_t commHash, + int nNodes, int nranks, int rank) { + struct inspectorCommInfoList* liveCommInfoList = &g_state.liveComms; + struct inspectorCommInfo* commInfoPtr = nullptr; + + inspectorResult_t res = inspectorSuccess; + bool locked = false; + INSPECTOR_LOCK_RD_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard -rd"); + for (struct inspectorCommInfo* itr = liveCommInfoList->comms; + itr != nullptr; + itr = itr->next) { + if (comm_eq(commHash, itr->commHash, rank, itr->rank)) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: comm 0x%lx already in tracker", + commHash); + res = inspectorAddDuplicateCommError; + goto finalize; + } + } + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard"); + commInfoPtr + = (struct inspectorCommInfo*)calloc(1, sizeof(struct inspectorCommInfo)); + if (0 == commInfoPtr) { + res = inspectorMemoryError; + goto finalize; + } + INS_CHK_GOTO(inspectorFillCommInfo(commInfoPtr, + commName, + commHash, + nNodes, + nranks, + rank), + res, fail); + + INSPECTOR_LOCK_WR_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard -wr"); + ++liveCommInfoList->ncomms; + commInfoPtr->next = liveCommInfoList->comms; + liveCommInfoList->comms = commInfoPtr; + +finalize: + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard"); + *commInfo = commInfoPtr; + return res; +fail: + if (commInfoPtr) { + free(commInfoPtr); + commInfoPtr = nullptr; + } + goto finalize; +} + +/* + * Description: + * + * Removes a communicator from the global state and moves it to the + * deleted list. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo *commInfo - communicator to remove. + * + * Output: + * Communicator is removed from live list and added to deleted list. + * + * Return: + * inspectorResult_t - success or error code. 
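+ *
+ * Usage sketch:
+ *   inspectorDelComm(commInfo);  // commInfo previously returned by inspectorAddComm()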
+ */ +inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo) { + struct inspectorCommInfoList* liveCommInfoList = &g_state.liveComms; + struct inspectorCommInfoList* deletedCommInfoList = &g_state.deletedComms; + struct inspectorCommInfo* commInfoPtr = nullptr; + bool locked = false; + + INFO(NCCL_INSPECTOR, "NCCL Inspector: DelComm removing 0x%lx", + commInfo->commHash); + + INSPECTOR_LOCK_WR_FLAG(&liveCommInfoList->guard, locked, + "inspectorDelComm: liveCommInfoList::guard -wr"); + struct inspectorCommInfo** prev_ptr = &liveCommInfoList->comms; + for (struct inspectorCommInfo* itr = liveCommInfoList->comms; + itr != nullptr; + itr = itr->next) { + if (comm_eq(commInfo->commHash, itr->commHash, commInfo->rank, itr->rank)) { + *prev_ptr = itr->next; + liveCommInfoList->ncomms--; + + commInfoPtr = itr; + break; + } + prev_ptr = &itr->next; + } + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorDelComm: liveCommInfoList::guard -unlock"); + + if (!commInfoPtr) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: DelComm can't remove 0x%lx, not present", + commInfo->commHash); + return inspectorDeleteUnknownCommError; + } + + inspectorLockWr(&commInfoPtr->guard); + commInfoPtr->dump = false; + inspectorUnlockRWLock(&commInfoPtr->guard); + + INSPECTOR_LOCK_WR_FLAG(&deletedCommInfoList->guard, locked, + "inspectorDelComm: deletedCommInfoList::guard -wr"); + commInfoPtr->next = deletedCommInfoList->comms; + deletedCommInfoList->comms = commInfoPtr; + deletedCommInfoList->ncomms++; + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&deletedCommInfoList->guard, locked, + "inspectorDelComm: deletedCommInfoList::guard -unlock"); + + return inspectorSuccess; +} + +/* + * Description: + * + * Computes the algorithmic and bus bandwidth (in GB/s) for a given + * NCCL collective operation, based on the communication info and + * completed collective details. The calculation uses the message + * size, execution time, and the type of collective operation to + * determine the effective bandwidths. The 'factor' variable adjusts + * the bus bandwidth calculation according to the communication + * pattern of each collective, as described in the NCCL performance + * documentation: + * https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + * + * Thread Safety: + * + * This function does not perform any locking and assumes the caller + * ensures thread safety if required. + * + * Input: + * + * commInfo - Pointer to inspectorCommInfo structure containing + * communicator details. + * + * completedColl- Pointer to inspectorCompletedCollInfo structure + * containing completed collective info. + * + * collType - The type of NCCL collective operation (ncclFunc_t). + * + * Output: + * Updates the algoBwGbs and busBwGbs fields of the completedColl + * structure. + * + * Return: + * N.A. 
(void function) + */ +void inspectorComputeCollBw(struct inspectorCommInfo *commInfo, + struct inspectorCompletedCollInfo *completedColl, + ncclFunc_t collType) { + double timeInSec = completedColl->execTimeUsecs / 1000000.0; + double factor = 0.0; + double trafficSize = 0.0; + switch (collType) { + case ncclFuncReduce: + case ncclFuncBroadcast: + trafficSize = (double)completedColl->msgSizeBytes; + factor = 1; + break; + case ncclFuncAllReduce: + trafficSize = (double)completedColl->msgSizeBytes; + factor = ((double)(2 * (commInfo->nranks - 1))) / ((double)commInfo->nranks); + break; + case ncclFuncReduceScatter: + trafficSize = (double)(completedColl->msgSizeBytes * commInfo->nranks); + factor = ((double)(commInfo->nranks - 1)) / ((double)commInfo->nranks); + break; + case ncclFuncAllGather: + trafficSize = (double)(completedColl->msgSizeBytes * commInfo->nranks); + factor = ((double)(commInfo->nranks - 1)) / ((double)commInfo->nranks); + break; + case ncclFuncSendRecv: + case ncclFuncSend: + case ncclFuncRecv: + trafficSize = (double)completedColl->msgSizeBytes; + factor = 1; + break; + default: + trafficSize = 0; + factor = 0.0; + } + completedColl->algoBwGbs = timeInSec != 0 ? (trafficSize / 1.0E9 / timeInSec) : 0; + completedColl->busBwGbs = completedColl->algoBwGbs * factor; +} + +/* + * Description: + * + * Helper function to calculate kernel execution time using GPU + * clock values. The GPU clock values are measured in nanoseconds + * from the globaltimer register. + * + * Thread Safety: + * Thread-safe (read-only operations on kernel info). + * + * Input: + * struct inspectorKernelChInfo *kernelCh - kernel channel info + * containing GPU clock values. + * + * Output: + * None. + * + * Return: + * uint64_t - execution time in microseconds, or 0 if invalid timing + * data. + */ +static uint64_t calculateKernelGpuExecTimeUsecs(struct inspectorKernelChInfo *kernelCh) { + if (kernelCh->startGpuClk != 0 && kernelCh->stopGpuClk != 0) { + if (kernelCh->stopGpuClk > kernelCh->startGpuClk) { + uint64_t execTimeNanosecs = kernelCh->stopGpuClk - kernelCh->startGpuClk; + return execTimeNanosecs / 1000; + } + } + return 0; +} + +/* + * Description: + * + * Calculates the maximum kernel execution time across all kernel + * channels in a collective operation, using GPU clock values when + * available and falling back to CPU timestamps when necessary. + * + * Thread Safety: + * Thread-safe (read-only operations on collective info). + * + * Input: + * struct inspectorCollInfo *collInfo - collective operation info + * containing kernel channels. + * inspectorTimingSource_t *timingSource - pointer to store the timing source used. + * + * Output: + * timingSource is set to indicate whether GPU, CPU, or collective timing was used. + * + * Return: + * + * uint64_t - maximum execution time in microseconds across all + * kernels, or collective execution time if no kernel + * timing is available. 
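+ *
+ * Example (illustrative values, two channels):
+ *   ch0: startGpuClk=1000 ns, stopGpuClk=51000 ns              -> 50 us (GPU clock)
+ *   ch1: GPU clocks unset, tsStartUsec=10, tsCompletedUsec=80  -> 70 us (CPU)
+ *   result: 70 us, timingSource = kernel_cpu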
+ * + */ +static uint64_t calculateMaxKernelExecTimeUsecs(struct inspectorCollInfo *collInfo, + inspectorTimingSource_t *timingSource) { + uint64_t maxKernelExecTimeUsecs = 0; + inspectorTimingSource_t bestTimingSource = inspectorTimingSourceCollectiveCpu; + + for (uint32_t i = 0; i < collInfo->nChannels; i++) { + struct inspectorKernelChInfo *kernelCh = &collInfo->kernelCh[i]; + uint64_t gpuExecTimeUsecs = calculateKernelGpuExecTimeUsecs(kernelCh); + if (gpuExecTimeUsecs > 0) { + if (gpuExecTimeUsecs > maxKernelExecTimeUsecs) { + maxKernelExecTimeUsecs = gpuExecTimeUsecs; + bestTimingSource = inspectorTimingSourceKernelGpu; + } + } else { + if (kernelCh->tsCompletedUsec > kernelCh->tsStartUsec) { + uint64_t cpuExecTimeUsecs = kernelCh->tsCompletedUsec - kernelCh->tsStartUsec; + if (cpuExecTimeUsecs > maxKernelExecTimeUsecs) { + maxKernelExecTimeUsecs = cpuExecTimeUsecs; + bestTimingSource = inspectorTimingSourceKernelCpu; + } + } + } + } + + if (maxKernelExecTimeUsecs > 0) { + *timingSource = bestTimingSource; + return maxKernelExecTimeUsecs; + } else { + *timingSource = inspectorTimingSourceCollectiveCpu; + return collInfo->tsCompletedUsec - collInfo->tsStartUsec; + } +} + +/* + * Description: + * + * Updates the performance information for a completed collective + * operation. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo *commInfo - communicator info. + * struct inspectorCollInfo *collInfo - completed collective info. + * + * Output: + * commInfo is updated with completed collective info. + * + * Return: + * None. + * + */ +void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl, + struct inspectorCollInfo *collInfo) { + completedColl->func = ncclStringToFunc(collInfo->func); + completedColl->sn = collInfo->sn; + completedColl->msgSizeBytes = collInfo->msgSizeBytes; + completedColl->execTimeUsecs = + calculateMaxKernelExecTimeUsecs(collInfo, &completedColl->timingSource); + completedColl->collEvtTrk = collInfo->collEvtTrk; +} + +/* + * Description: + * + * Finalizes the global inspector state and stops the dump thread if + * running. + * + * Thread Safety: + * Not thread-safe (should be called during teardown). + * + * Input: + * None. + * + * Output: + * Global state is finalized and dump thread is stopped. + * + * Return: + * inspectorResult_t - success or error code. 
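+ *
+ * Usage sketch:
+ *   inspectorGlobalInit(rank);   // once, during plugin init
+ *   ...
+ *   inspectorGlobalFinalize();   // once, during plugin teardown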
+ * + */ +inspectorResult_t inspectorGlobalFinalize() { + if (dumper) { + dumper->stopThread(); + delete dumper; + dumper = nullptr; + } + return inspectorSuccess; +} diff --git a/ext-profiler/inspector/inspector.h b/ext-profiler/inspector/inspector.h new file mode 100644 index 000000000..98e050f97 --- /dev/null +++ b/ext-profiler/inspector/inspector.h @@ -0,0 +1,198 @@ +#pragma once + +#include + +#include "json.h" +#include "common.h" +#include "version.h" + +#define MAX_CHANNELS 64 + +#define INS_CHK_GOTO(call, res, label) \ + do { \ + res = call; \ + if (inspectorSuccess != res) { \ + INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \ + inspectorErrorString(res)); \ + goto label; \ + } \ + } while (0); + + +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +typedef enum { + inspectorSuccess = 0, + inspectorUninitializedError, + inspectorMemoryError, + inspectorFileOpenError, + inspectorDisabledError, + inspectorLockError, + inspectorPthreadError, + inspectorJsonError, + inspectorCudaError, + inspectorBadHash, + inspectorDeleteUnknownCommError, + inspectorAddDuplicateCommError, + inspectorNop, + inspectorNullTally, + inspectorGlobalInitError, + inspectorReturn, +} inspectorResult_t; + +typedef enum { + inspectorTimingSourceKernelGpu = 0, + inspectorTimingSourceKernelCpu = 1, + inspectorTimingSourceCollectiveCpu = 2, +} inspectorTimingSource_t; + +struct inspectorEventTraceInfo { + uint64_t ts; + uint64_t sn; +}; + +typedef enum { + NCCL_INSP_EVT_TRK_COLL_START = 0, + NCCL_INSP_EVT_TRK_COLL_STOP = 1, + NCCL_INSP_EVT_TRK_COLL_NEVT = 2, +} inspectorEventTrkColl_t; + +typedef enum { + NCCL_INSP_EVT_TRK_KERNEL_START = 0, + NCCL_INSP_EVT_TRK_KERNEL_STOP = 1, + NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2, + NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3, +} inspectorEventTrkKernel_t; + +struct inspectorEventTrkKernelInfo { + struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT]; +}; + +struct inspectorEventTrkCollInfo { + int sn; + uint32_t nChannels; + struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT]; + struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS]; +}; + +struct inspectorCompletedCollInfo { + ncclFunc_t func; + uint64_t sn; + size_t msgSizeBytes; + uint64_t execTimeUsecs; + inspectorTimingSource_t timingSource; + double algoBwGbs; + double busBwGbs; + // Event trace information + struct inspectorEventTrkCollInfo collEvtTrk; +}; + +enum { + NCCL_COMM_HASH_LENGTH = 17 +}; + +struct inspectorCommInfo { + struct inspectorCommInfo* next; + + const char* commName; + uint64_t commHash; + char commHashStr[NCCL_COMM_HASH_LENGTH]; + int rank; + int nranks; + int nnodes; + + bool dump; + struct inspectorCompletedCollInfo completedCollInfo; + pthread_rwlock_t guard; +}; + +struct inspectorKernelChInfo { + uint64_t type; + int refCount; /*unused*/ + struct inspectorCollInfo *collInfo; + uint8_t channelId; + uint64_t tsStartUsec; + uint64_t tsCompletedUsec; + uint64_t startGpuClk; + uint64_t stopGpuClk; +}; + +struct inspectorCollInfo { + uint64_t type; + int refCount; + struct inspectorCommInfo *commInfo; + const char* func; + uint64_t sn; + size_t msgSizeBytes; + uint64_t tsStartUsec; + uint64_t tsCompletedUsec; + uint32_t nChannels; + uint32_t nKernelChStarted; + uint32_t nKernelChCompleted; + pthread_rwlock_t guard; + struct inspectorKernelChInfo 
kernelCh[MAX_CHANNELS]; + struct inspectorEventTrkCollInfo collEvtTrk; +}; + + + +extern ncclDebugLogger_t logFn; +#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) + +inline int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + case ncclFloat8e4m3: + case ncclFloat8e5m2: + return 1; + case ncclFloat16: + case ncclBfloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} + +const char* inspectorErrorString(inspectorResult_t result); + +inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorGlobalInit(int rank); +inspectorResult_t inspectorGlobalFinalize(); +uint64_t inspectorGetTime(); +inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo, + const char* commName, uint64_t commHash, + int nNodes, int nranks, int rank); +inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo); + +void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl, + struct inspectorCollInfo *collInfo); +ncclDataType_t inspectorStringToDatatype(const char* str); + +void inspectorComputeCollBw(struct inspectorCommInfo *commInfo, + struct inspectorCompletedCollInfo *completedColl, + ncclFunc_t collType); diff --git a/ext-profiler/inspector/inspector_plugin.cc b/ext-profiler/inspector/inspector_plugin.cc new file mode 100644 index 000000000..b1872157d --- /dev/null +++ b/ext-profiler/inspector/inspector_plugin.cc @@ -0,0 +1,493 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "profiler.h" +#include "inspector.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +static int gInitialized; + +static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER; + + +/* + * Description: + * Records an event trace with timestamp and sequence number + * + * Thread Safety: + * Not thread-safe - must be called with proper locking. This function + * is designed to be called from within locked sections where the + * collective info structure is already protected. 
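+ *
+ * Usage sketch:
+ *   inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
+ *                             NCCL_INSP_EVT_TRK_COLL_START, collInfo);
+ *   stores the current time and the next per-collective sequence number.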
+ * + * Input: + * struct inspectorEventTraceInfo* evtTrace - event trace array + * int eventIndex - index in the event trace array (must be valid) + * struct inspectorCollInfo* collInfo - collective info structure (must not be NULL) + * + * Output: + * Event trace is updated with current timestamp and next sequence + * number from collective + * + * Return: + * uint64_t - the sequence number assigned to this event + * + * Preconditions: + * - collInfo must not be NULL + * - eventIndex must be within valid bounds for evtTrace array + * - Function must be called from within a locked section + */ +static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace, + int eventIndex, + struct inspectorCollInfo* collInfo) { + evtTrace[eventIndex].ts = inspectorGetTime(); + evtTrace[eventIndex].sn = ++collInfo->collEvtTrk.sn; // Increment coll sequence counter + + return evtTrace[eventIndex].sn; +} + +/* + * Description: + * + * Initializes the NCCL Inspector plugin and global state for a + * communicator. + * + * Thread Safety: + * Thread-safe (uses mutex for initialization). + * + * Input: + * void** context - pointer to plugin context. + * int* eActivationMask - pointer to activation mask output. + * const char* commName - communicator name. + * uint64_t commHash - communicator hash. + * int nNodes - number of nodes. + * int nranks - number of ranks. + * int rank - rank. + * ncclDebugLogger_t logfn - logger function pointer. + * + * Output: + * context is set to plugin context; eActivationMask is set. + * + * Return: + * ncclResult_t - success or error code. + * + */ +__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash, + int* eActivationMask, + const char* commName, + int nNodes, int nranks, int rank, + ncclDebugLogger_t logfn) { + inspectorResult_t res = inspectorSuccess; + *context = nullptr; + logFn = logfn; + + pthread_mutex_lock(&gLock); + if (++gInitialized == 1) { + res = inspectorGlobalInit(rank); + if (res != inspectorSuccess) { + WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res, + inspectorErrorString(res)); + gInitialized = 0; + pthread_mutex_unlock(&gLock); + return ncclInternalError; + } + } + pthread_mutex_unlock(&gLock); + + INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context, + commName, commHash, + nNodes, nranks, rank), res, success); + *eActivationMask = ncclProfileColl | ncclProfileKernelCh; + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", + commName ? commName : "", commHash, nranks, rank); +success: + if (res != inspectorSuccess) { + return ncclInternalError; + } else { + return ncclSuccess; + } +} + +/* + * Description: + * + * Finalizes the NCCL Inspector plugin and global state for a + * communicator. + * + * Thread Safety: + * Thread-safe (uses mutex for finalization). + * + * Input: + * void* context - plugin context. + * + * Output: + * Plugin context is finalized and cleaned up. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginFinalize(void* context) { + inspectorDelComm((struct inspectorCommInfo *)context); + pthread_mutex_lock(&gLock); + if (--gInitialized == 0) { + inspectorGlobalFinalize(); + } + pthread_mutex_unlock(&gLock); + return ncclSuccess; +} + +inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) { + collInfo->refCount += 1; + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) { + inspectorLockWr(&collInfo->guard); + inspectorPluginCollInfoRef(collInfo); + inspectorUnlockRWLock(&collInfo->guard); + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) { + collInfo->refCount -= 1; + if (collInfo->refCount == 0) { + inspectorLockDestroy(&collInfo->guard); + memset(collInfo, 0, sizeof(struct inspectorCollInfo)); + free(collInfo); + return inspectorReturn; + } + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) { + inspectorLockWr(&collInfo->guard); + inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo); + inspectorUnlockRWLock(&collInfo->guard); + return res; +} + +/* + * Description: + * Initializes a new inspectorCollInfo structure for a collective + * event. + * + * Thread Safety: + * Not thread-safe (allocates and initializes a new collective info + * structure). + * + * Input: + * + * struct inspectorCollInfo **collInfo - pointer to output + * collective info struct. + * ncclProfilerEventDescr_t *eDescr - event descriptor. + * + * Output: + * collInfo is set to the new collective info struct. + * + * Return: + * None. + */ +static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo, + ncclProfilerEventDescr_t *eDescr, + struct inspectorCommInfo *commInfo) { + struct inspectorCollInfo *collInfoPtr + = (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo)); + if (collInfoPtr == nullptr) { + WARN("Inspector: Failed to allocate memory for collective info structure"); + *collInfo = nullptr; + return; + } + collInfoPtr->type = ncclProfileColl; + collInfoPtr->refCount = 0; + inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed + collInfoPtr->func = eDescr->coll.func; + collInfoPtr->sn = eDescr->coll.seqNumber; + collInfoPtr->nChannels = eDescr->coll.nChannels; + if (collInfoPtr->nChannels > 0) { + inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion + } + collInfoPtr->tsStartUsec = inspectorGetTime(); + collInfoPtr->msgSizeBytes = + ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count; + + + collInfoPtr->commInfo = commInfo; + collInfoPtr->collEvtTrk.sn = 0; + collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels; + inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace, + NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr); + + inspectorLockInit(&collInfoPtr->guard); + *collInfo = collInfoPtr; +} + +/* + * Description: + * + * Initializes a new inspectorKernelChInfo structure for a kernel + * channel event. + * + * Thread Safety: + * Not thread-safe (initializes kernel channel info within a + * collective info structure). + * + * Input: + * struct inspectorKernelChInfo **kernelChInfo - pointer to output + * kernel channel info struct. + * ncclProfilerEventDescr_t *eDescr - event descriptor. + * + * Output: + * + * kernelChInfo is set to the new kernel channel info struct. + * + * Return: + * None. 
+ */ +static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo, + ncclProfilerEventDescr_t *eDescr) { + if (eDescr->parentObj) { + uint64_t parentType=*(uint64_t*)eDescr->parentObj; + if (parentType == ncclProfileColl) { + struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk = + collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_START, + collInfo); + struct inspectorKernelChInfo *kernelChInfoPtr + = &collInfo->kernelCh[eDescr->kernelCh.channelId]; + kernelChInfoPtr->type = ncclProfileKernelCh; + kernelChInfoPtr->channelId = eDescr->kernelCh.channelId; + kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer; + if (kernelChInfoPtr->stopGpuClk == 0) { + inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event + } + kernelChInfoPtr->tsStartUsec = inspectorGetTime(); + if (collInfo->nKernelChStarted == 0) { + collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec; + } + collInfo->nKernelChStarted += 1; + inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event + kernelChInfoPtr->collInfo = collInfo; + + *kernelChInfo = kernelChInfoPtr; + inspectorUnlockRWLock(&collInfo->guard); + } + } + } +} +/* + * Description: + * + * Starts a profiling event for the NCCL Inspector plugin. + * + * Thread Safety: + * Thread-safe (allocates and initializes event structures). + * + * Input: + * void* context - plugin context. + * void** eHandle - pointer to event handle output. + * ncclProfilerEventDescr_t* eDescr - event descriptor. + * + * Output: + * eHandle is set to the new event structure. + * + * Return: + * ncclResult_t - success or error code. + * + */ +__hidden ncclResult_t inspectorPluginStartEvent(void* context, + void** eHandle, + ncclProfilerEventDescr_t* eDescr) { + if (context == nullptr || eDescr == nullptr) { + INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__); + return ncclSuccess; + } + *eHandle = nullptr; + if (eDescr->type == ncclProfileColl) { + struct inspectorCollInfo *collEvent = nullptr; + struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context; + inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx); + *eHandle = collEvent; + } else if (eDescr->type == ncclProfileKernelCh) { + struct inspectorKernelChInfo *kernelChEvent = nullptr; + inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr); + *eHandle = kernelChEvent; + } else { + return ncclSuccess; + } + return ncclSuccess; +} + +/* + * Description: + * + * Stops a profiling event for the NCCL Inspector plugin. + * + * Thread Safety: + * + * Thread-safe (updates event state and performance info). + * + * Input: + * + * void *eHandle - event handle. + * + * Output: + * + * Event is stopped and performance info may be updated. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) { + + if (eHandle == nullptr) { + INFO(NCCL_INIT, + "Profiler/Plugin: Event Handle NULL for start event %s", __func__); + return ncclSuccess; + } + uint64_t type = *(uint64_t *)eHandle; + inspectorResult_t res = inspectorSuccess; + + if (type == ncclProfileColl) { + struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle; + // Record collective stop event + inspectorLockWr(&collInfo->guard); + inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace, + NCCL_INSP_EVT_TRK_COLL_STOP, + collInfo); + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + // WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileColl"); + return ncclSuccess; + } + inspectorUnlockRWLock(&collInfo->guard); + return ncclSuccess; + } else if (type == ncclProfileKernelCh) { + struct inspectorKernelChInfo *kernelChInfo + = (struct inspectorKernelChInfo *)eHandle; + struct inspectorCollInfo *collInfo = kernelChInfo->collInfo; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk = + collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_STOP, + collInfo); + kernelChInfo->tsCompletedUsec = inspectorGetTime(); + collInfo->nKernelChCompleted += 1; + + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh"); + return ncclSuccess; + } + if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted) + && (collInfo->nKernelChCompleted == collInfo->nChannels)) { + struct inspectorCompletedCollInfo completedColl; + struct inspectorCommInfo *commInfo = collInfo->commInfo; + collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec; + inspectorUpdateCollPerf(&completedColl, collInfo); + + res = inspectorPluginCollInfoDeRef(collInfo); + if (res != inspectorReturn) { + inspectorUnlockRWLock(&collInfo->guard); + } + if (commInfo != nullptr) { + inspectorLockWr(&commInfo->guard); + inspectorComputeCollBw(commInfo, + &completedColl, + completedColl.func); + memcpy(&commInfo->completedCollInfo, + &completedColl, + sizeof(struct inspectorCompletedCollInfo)); + commInfo->dump = true; + inspectorUnlockRWLock(&commInfo->guard); + } + return ncclSuccess; + } + inspectorUnlockRWLock(&collInfo->guard); + } + return ncclSuccess; + } + return ncclSuccess; +} + +/* + * Description: + * + * Records the state of a profiling event for the NCCL Inspector + * plugin. + * + * Thread Safety: + * + * Thread-safe (updates event state as needed). + * + * Input: + * void* eHandle - event handle. + * ncclProfilerEventState_t eState - event state. + * ncclProfilerEventStateArgs_t* eStateArgs - event state arguments. + * + * Output: + * Event state is updated as needed. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle, + ncclProfilerEventState_t eState, + ncclProfilerEventStateArgs_t* eStateArgs) { + if (eHandle == nullptr || eStateArgs == nullptr) + return ncclSuccess; + + uint64_t type = *(uint64_t *)eHandle; + + if (type == ncclProfileKernelCh && eState == ncclProfilerKernelChStop) { + struct inspectorKernelChInfo *kernelChInfo = (struct inspectorKernelChInfo *)eHandle; + struct inspectorCollInfo *collInfo = kernelChInfo->collInfo; + inspectorResult_t res = inspectorSuccess; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk + = collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_RECORD, + collInfo); + kernelChInfo->stopGpuClk = eStateArgs->kernelCh.pTimer; + if (kernelChInfo->startGpuClk != 0) { + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState"); + return ncclSuccess; + } + } + inspectorUnlockRWLock(&collInfo->guard); + } + } + return ncclSuccess; +} + +ncclProfiler_t ncclProfiler_v5 = { + "Inspector", + inspectorPluginInit, + inspectorPluginStartEvent, + inspectorPluginStopEvent, + inspectorPluginRecordEventState, + inspectorPluginFinalize, +}; diff --git a/ext-profiler/inspector/json.cc b/ext-profiler/inspector/json.cc new file mode 100644 index 000000000..e95d98d18 --- /dev/null +++ b/ext-profiler/inspector/json.cc @@ -0,0 +1,496 @@ +#include "json.h" +#include +#include +#include +#include +#include +#include + +const char* jsonErrorString(jsonResult_t res) { + switch (res) { + case jsonSuccess: + return "jsonSuccess"; + case jsonFileError: + return "jsonFileError"; + case jsonUnknownStateError: + return "jsonUnknownStateError"; + case jsonEmptyStateError: + return "jsonEmptyStateError"; + case jsonExpectedNonNoneStateError: + return "jsonExpectedNonNoneStateError"; + case jsonMemoryError: + return "jsonMemoryError"; + case jsonStringOverflowError: + return "jsonStringOverflowError"; + case jsonStringBadChar: + return "jsonStringBadChar"; + case jsonLockError: + return "jsonLockError"; + default: + return "unknown json error"; + } +} + +// We use these statics to mantain a stack of states where we are writing. +typedef struct jsonFileOutput { + jsonState_t* states; + size_t state_cap; // Allocated stack capacity + size_t state_n; // # of items in the stack. 
+ FILE* fp; + pthread_mutex_t mutex; +} jsonFileOutput; + +jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) { + jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput)); + if (new_jfo == NULL) { + return jsonMemoryError; + } + if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) { + free(new_jfo); + *jfo = 0; + return jsonLockError; + } + new_jfo->states = NULL; + new_jfo->state_cap = 0; + new_jfo->state_n = 0; + new_jfo->fp = fopen(outfile, "w"); + if (new_jfo->fp == NULL) { + free(new_jfo); + *jfo = 0; + return jsonFileError; + } + *jfo = new_jfo; + return jsonSuccess; +} + +jsonResult_t jsonNewline(jsonFileOutput* jfo) { + fprintf(jfo->fp, "\n"); + return jsonSuccess; +} + +jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) { + fflush(jfo->fp); + return jsonSuccess; +} + +jsonResult_t jsonLockOutput(jsonFileOutput* jfo) { + if (pthread_mutex_lock(&jfo->mutex) != 0) { + return jsonLockError; + } + return jsonSuccess; +} + +jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) { + if (pthread_mutex_unlock(&jfo->mutex) != 0) { + return jsonLockError; + } + return jsonSuccess; +} + +jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) { + // Really should probably complain if we aren't in a valid state + + if (pthread_mutex_destroy(&jfo->mutex) != 0) { + free(jfo); + return jsonLockError; + } + if (jfo->states != NULL) { + free(jfo->states); + } + jfo->states = NULL; + jfo->state_cap = 0; + jfo->state_n = 0; + if (jfo->fp) { + fclose(jfo->fp); + jfo->fp = 0; + } + + free(jfo); + return jsonSuccess; +} + +static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) { + int copy_len; + if ((in[0] & 0xE0) == 0xC0) { + // 2-byte sequence + if ((in[1] & 0xC0) != 0x80 || out_lim < 2) { + return 0; + } + copy_len = 2; + } else if ((in[0] & 0xF0) == 0xE0) { + // 3-byte sequence + if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80 || out_lim < 3) { + return 0; + } + copy_len = 3; + } else if ((in[0] & 0xF8) == 0xF0) { + // 4-byte sequence + if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80 || (in[3] & 0xC0) != 0x80 || out_lim < 4) { + return 0; + } + copy_len = 4; + } else { + // Invalid start byte + return 0; + } + + for (int i = 0; i < copy_len; ++i) { + out[i] = in[i]; + } + + return copy_len; +} + +// This tries to sanitize/quote a string from 'in' into 'out', +// assuming 'out' has length 'lim'. We mainly quote ",/,\,\t,\n, and +// bail if we encounter non-printable stuff or non-ASCII stuff. +// 'in' should be null-terminated, of course. +// +// We return false if we were not able to copy all of 'in', either for +// length reasons or for unhandled characters. 
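+// (More precisely: the return value is a jsonResult_t. jsonStringOverflowError is
+// returned when 'out' runs out of space, jsonStringBadChar for control characters
+// or malformed UTF-8; valid multi-byte UTF-8 sequences are copied through as-is
+// via utf8copy, and jsonSuccess is returned when all of 'in' was copied.)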
+static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) { + int c = 0; + while (*in) { + if (c + 1 >= lim) { + out[c] = 0; + return jsonStringOverflowError; + } + switch (*in) { + case '"': + case '\\': + case '/': + case '\t': + case '\n': + if (c + 2 > lim) { + out[c] = 0; + return jsonStringOverflowError; + } + + out[c++] = '\\'; + if (*in == '\n') { + out[c++] = 'n'; + } else if (*in == '\t') { + out[c++] = 't'; + } else { + out[c++] = *in; + } + ++in; + break; + default: + if (*in <= 0x1F) { + out[c] = 0; + return jsonStringBadChar; + } else if (*in <= 0x7F) { + out[c++] = *in; + ++in; + } else { + const int utf8len = utf8copy(out + c, lim - c - 1, in); + if (utf8len == 0) { + out[c] = 0; + return jsonStringBadChar; + } + c += utf8len; + in += utf8len; + } + break; + } + } + out[c] = 0; + return jsonSuccess; +} + +static size_t max(size_t a, size_t b) { + if (a < b) { + return b; + } + return a; +} + +// Push state onto the state stack. Reallocate for extra storage if needed. +// Because JSON_NONE is a pseudo-state, don't allow it to be pushed. +static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) { + if (state == JSON_NONE) { + return jsonExpectedNonNoneStateError; + } + if (jfo->state_cap <= (jfo->state_n + 1)) { + jfo->state_cap = max((size_t)16, jfo->state_cap * 2); + jfo->states = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * jfo->state_cap); + if (jfo->states == 0) { + return jsonMemoryError; + } + } + jfo->states[jfo->state_n++] = state; + return jsonSuccess; +} + +// Return the current state at the top of the stack +static jsonState_t jsonCurrState(const jsonFileOutput* jfo) { + if (jfo->state_n == 0) { + return JSON_NONE; + } + return jfo->states[jfo->state_n - 1]; +} + +// Replace the stack with state (equivalent to a pop & push if stack is not empty) +static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) { + if (state == JSON_NONE) { + return jsonExpectedNonNoneStateError; + } + if (jfo->state_n == 0) { + return jsonEmptyStateError; + } + jfo->states[jfo->state_n - 1] = state; + return jsonSuccess; +} + +// Pop the top state off the stack, or return that the state is empty +static jsonState_t jsonPopState(jsonFileOutput* jfo) { + if (jfo->state_n == 0) { + return JSON_NONE; + } + return jfo->states[--jfo->state_n]; +} + +// Emit a key and separator. Santize the key. +// This is only acceptable if the top state is an object +// Emit a ',' separator of we aren't the first item. +jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) { + switch (jsonCurrState(jfo)) { + case JSON_OBJECT_EMPTY: + jsonReplaceState(jfo, JSON_OBJECT_SOME); + break; + case JSON_OBJECT_SOME: + fprintf(jfo->fp, ","); + break; + default: + return jsonUnknownStateError; + } + unsigned char tmp[2048]; + const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "\"%s\":", tmp); + jsonPushState(jfo, JSON_KEY); + return jsonSuccess; +} + +// Helper function for inserting values. +// Only acceptable after keys, top-level, or in lists. +// Emit preceeding ',' if in a list and not first item. 
+static jsonResult_t jsonValHelper(jsonFileOutput* jfo) { + switch (jsonCurrState(jfo)) { + case JSON_LIST_EMPTY: + jsonReplaceState(jfo, JSON_LIST_SOME); + break; + case JSON_LIST_SOME: + fprintf(jfo->fp, ","); + break; + case JSON_KEY: + jsonPopState(jfo); + break; + case JSON_NONE: + break; + default: + return jsonUnknownStateError; + } + return jsonSuccess; +} + +// Start an object +jsonResult_t jsonStartObject(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "{"); + return jsonPushState(jfo, JSON_OBJECT_EMPTY); +} + +// Close an object +jsonResult_t jsonFinishObject(jsonFileOutput* jfo) { + switch (jsonPopState(jfo)) { + case JSON_OBJECT_EMPTY: + case JSON_OBJECT_SOME: + break; + default: + return jsonUnknownStateError; + } + fprintf(jfo->fp, "}"); + return jsonSuccess; +} + +// Start a list +jsonResult_t jsonStartList(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "["); + return jsonPushState(jfo, JSON_LIST_EMPTY); +} + +// Close a list +jsonResult_t jsonFinishList(jsonFileOutput* jfo) { + switch (jsonPopState(jfo)) { + case JSON_LIST_EMPTY: + case JSON_LIST_SOME: + break; + default: + return jsonUnknownStateError; + } + fprintf(jfo->fp, "]"); + return jsonSuccess; +} + +// Write a null value +jsonResult_t jsonNull(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "null"); + return jsonSuccess; +} + +// Write a (sanititzed) string +jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) { + if (str == NULL) { + jsonNull(jfo); + return jsonSuccess; + } + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + unsigned char tmp[2048]; + const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str); + if (san_res != jsonSuccess) { + return san_res; + } + fprintf(jfo->fp, "\"%s\"", tmp); + return jsonSuccess; +} + +// Write a bool as "true" or "false" strings. +jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) { + return jsonStr(jfo, val ? 
"true" : "false"); +} + +// Write an integer value +jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%d", val); + return jsonSuccess; +} + +// Write an integer value +jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%u", val); + return jsonSuccess; +} + + +// Write an integer value +jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%lu", val); + return jsonSuccess; +} + +// Write a size_t value +jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%zu", val); + return jsonSuccess; +} + +// Write a double value +jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + if (val != val) { + fprintf(jfo->fp, "\"nan\""); + } else { + fprintf(jfo->fp, "%lf", val); + } + return jsonSuccess; +} + +#ifdef DO_JSON_TEST +// compile with +// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test +// run with: +// ./json_test +// if something fails, it will print out the error +// if it all works, print out "output matches reference" +#define JSONCHECK(expr) \ + do { \ + const jsonResult_t res = (expr); \ + if (res != jsonSuccess) { \ + fprintf(stderr, "jsonError: %s\n", jsonErrorString(res)); \ + exit(1); \ + } \ + } while (0) + +int main() { + + const char refstr[] = + "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ " + "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}"; + + jsonFileOutput* jfo; + JSONCHECK(jsonInitFileOutput(&jfo, "test.json")); + JSONCHECK(jsonStartObject(jfo)); + JSONCHECK(jsonKey(jfo, "number")); + JSONCHECK(jsonInt(jfo, 123)); + JSONCHECK(jsonKey(jfo, "utfstring")); + JSONCHECK( + jsonStr(jfo, "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),")); + JSONCHECK(jsonKey(jfo, "list")); + JSONCHECK(jsonStartList(jfo)); + JSONCHECK(jsonBool(jfo, true)); + JSONCHECK(jsonNull(jfo)); + JSONCHECK(jsonUint64(jfo, 9423812381231ULL)); + JSONCHECK(jsonSize_t(jfo, 3123111)); + JSONCHECK(jsonDouble(jfo, 0.69423413)); + JSONCHECK(jsonFinishList(jfo)); + JSONCHECK(jsonFinishObject(jfo)); + JSONCHECK(jsonFinalizeFileOutput(jfo)); + + FILE* fp = fopen("test.json", "r"); + + const size_t reflen = sizeof(refstr) / sizeof(char); + + char buffer[reflen]; + + fread(buffer, sizeof(char), reflen, fp); + + fclose(fp); + + if (memcmp(buffer, refstr, reflen) == 0) { + printf("output matches reference\n"); + } else { + printf("output %s\nreference %s\n", buffer, refstr); + return 1; + } + + return 0; +} + +#endif diff --git a/ext-profiler/inspector/json.h b/ext-profiler/inspector/json.h new file mode 100644 index 000000000..a0b684843 --- /dev/null +++ b/ext-profiler/inspector/json.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include + +typedef enum { + JSON_NONE, // A pseudo-state meaning that the document is empty + JSON_KEY, + JSON_OBJECT_EMPTY, + JSON_OBJECT_SOME, + JSON_LIST_EMPTY, + JSON_LIST_SOME, +} jsonState_t; + +typedef enum { + jsonSuccess, + jsonFileError, + jsonUnknownStateError, + 
jsonEmptyStateError, + jsonExpectedNonNoneStateError, + jsonStringOverflowError, + jsonStringBadChar, + jsonMemoryError, + jsonLockError, +} jsonResult_t; + +const char *jsonErrorString(jsonResult_t res); + +typedef struct jsonFileOutput jsonFileOutput; + +jsonResult_t jsonLockOutput(jsonFileOutput *jfo); + +jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo); + +jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo, + const char *outfile); + +jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo); + +jsonResult_t jsonNewline(jsonFileOutput *jfo); +jsonResult_t jsonFlushOutput(jsonFileOutput *jfo); + +// Emit a key and separator. Santize the key. +// This is only acceptable if the top state is an object +// Emit a ',' separator of we aren't the first item. +jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name); + +// Start an object +jsonResult_t jsonStartObject(jsonFileOutput *jfo); + +// Close an object +jsonResult_t jsonFinishObject(jsonFileOutput *jfo); + +// Start a list +jsonResult_t jsonStartList(jsonFileOutput *jfo); + +// Close a list +jsonResult_t jsonFinishList(jsonFileOutput *jfo); + +// Emit a null value +jsonResult_t jsonNull(jsonFileOutput *jfo); + +// Write a (sanititzed) string +jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str); + +// Write a bool as "true" or "false" strings. +jsonResult_t jsonBool(jsonFileOutput *jfo, bool val); + +// Write an integer value +jsonResult_t jsonInt(jsonFileOutput *jfo, const int val); + +//Write an unsigned int value +jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val); + +// Write an integer value +jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val); + +// Write a size_t value +jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val); + +// Write a double value +jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val); diff --git a/ext-profiler/inspector/nccl/common.h b/ext-profiler/inspector/nccl/common.h new file mode 100644 index 000000000..f8ab7e9e6 --- /dev/null +++ b/ext-profiler/inspector/nccl/common.h @@ -0,0 +1,73 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */ +/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */ + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, + ncclFloat8e4m3 = 10, + ncclFloat8e5m2 = 11, + ncclNumTypes = 12 +} ncclDataType_t; + +typedef enum { + NCCL_LOG_NONE = 0, + NCCL_LOG_VERSION = 1, + NCCL_LOG_WARN = 2, + NCCL_LOG_INFO = 3, + NCCL_LOG_ABORT = 4, + NCCL_LOG_TRACE = 5 +} ncclDebugLogLevel; + +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + + +typedef enum { + NCCL_INIT = 0x1, + NCCL_COLL = 0x2, + NCCL_P2P = 0x4, + NCCL_SHM = 0x8, + NCCL_NET = 0x10, + NCCL_GRAPH = 0x20, + NCCL_TUNING = 0x40, + NCCL_ENV = 0x80, + NCCL_ALLOC = 0x100, + NCCL_CALL = 0x200, + NCCL_PROXY = 0x400, + NCCL_NVLS = 0x800, + NCCL_BOOTSTRAP = 0x1000, + NCCL_REG = 0x2000, + NCCL_PROFILE = 0x4000, + NCCL_RAS = 0x8000, + NCCL_INSPECTOR = 0x100000, // big number to avoid short-term conflicts + NCCL_ALL = ~0 +} ncclDebugLogSubSys; + + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-profiler/inspector/nccl/profiler.h b/ext-profiler/inspector/nccl/profiler.h new file mode 100644 index 000000000..715885f72 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler.h @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_H_ +#define PROFILER_H_ + +#include +#include + +#include "common.h" + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined events states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerEndGroupApiStart = 23, + ncclProfilerBeginGroupApiEnd = 24 +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; + +#include "profiler_v5.h" +#include "profiler_v4.h" +#include "profiler_v3.h" +#include "profiler_v2.h" +#include "profiler_v1.h" +#include "profiler_net.h" + +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; + +#endif // end include guard diff --git a/ext-profiler/inspector/nccl/profiler_net.h b/ext-profiler/inspector/nccl/profiler_net.h new file mode 100644 index 000000000..4f0a4182c --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_net.h @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_NET_H_ +#define PROFILER_NET_H_ + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v1.h b/ext-profiler/inspector/nccl/profiler_v1.h new file mode 100644 index 000000000..9abcea76d --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v1.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +#include +#include +#include + + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState 
: event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v2.h b/ext-profiler/inspector/nccl/profiler_v2.h new file mode 100644 index 000000000..6a2699b58 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v2.h @@ -0,0 +1,108 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque 
profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v3.h b/ext-profiler/inspector/nccl/profiler_v3.h new file mode 100644 index 000000000..d4def08e1 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v3.h @@ -0,0 +1,116 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v4.h 
b/ext-profiler/inspector/nccl/profiler_v4.h new file mode 100644 index 000000000..75f00548d --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v4.h @@ -0,0 +1,127 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, 
ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v5.h b/ext-profiler/inspector/nccl/profiler_v5.h new file mode 100644 index 000000000..dab1db9e1 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v5.h @@ -0,0 +1,151 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + bool graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + 
// - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/ext-profiler/inspector/nccl/types.h b/ext-profiler/inspector/nccl/types.h new file mode 100644 index 000000000..f43fdc163 --- /dev/null +++ b/ext-profiler/inspector/nccl/types.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_TYPES_H_ +#define NCCL_TYPES_H_ + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, +} ncclDataType_t; + +#endif diff --git a/ext-profiler/inspector/version.h b/ext-profiler/inspector/version.h new file mode 100644 index 000000000..347757dfc --- /dev/null +++ b/ext-profiler/inspector/version.h @@ -0,0 +1,12 @@ +#ifndef VERSION_H +#define VERSION_H + +#ifdef __cplusplus +extern "C" { +#endif +const char* get_git_version_info(); +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // VERSION_H diff --git a/ext-tuner/README.md b/ext-tuner/README.md index 67a743a12..7595e03ba 100644 --- a/ext-tuner/README.md +++ b/ext-tuner/README.md @@ -179,4 +179,4 @@ When developing new tuner plugins: - [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/) - Example plugin implementations in this directory -For questions and support, refer to the NCCL community resources and documentation. \ No newline at end of file +For questions and support, refer to the NCCL community resources and documentation. 
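For reference, here is a minimal sketch of a v5 profiler plugin built against the nccl/profiler.h headers added above. The context and event structs, the chosen activation mask, and the "ExampleSkeleton" name are illustrative assumptions, not part of this patch; the exported symbol name ncclProfiler_v5 and the callback signatures follow the ncclProfiler_v5_t definition shown earlier. The leading uint64_t type field on the event handle mirrors the convention the inspector structs use so that a parent object's type can be read through eDescr->parentObj.

```c
// Minimal sketch of a v5 profiler plugin (assumed file layout: built next to the
// nccl/profiler.h headers added by this patch). Bookkeeping here is illustrative.
#include <stdlib.h>
#include <stdint.h>
#include "profiler.h"

struct exampleCtx   { uint64_t commId; int rank; };
struct exampleEvent { uint64_t type; };  // leading type tag, same convention as the inspector structs

static ncclResult_t exampleInit(void** context, uint64_t commId, int* eActivationMask,
                                const char* commName, int nNodes, int nranks, int rank,
                                ncclDebugLogger_t logfn) {
  struct exampleCtx* ctx = (struct exampleCtx*)calloc(1, sizeof(*ctx));
  if (ctx == NULL) return ncclSystemError;
  ctx->commId = commId; ctx->rank = rank;
  // Only ask NCCL for the event types this plugin actually handles.
  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
  *context = ctx;
  (void)commName; (void)nNodes; (void)nranks; (void)logfn;
  return ncclSuccess;
}

static ncclResult_t exampleStartEvent(void* context, void** eHandle,
                                      ncclProfilerEventDescr_v5_t* eDescr) {
  *eHandle = NULL;
  if (eDescr->type != ncclProfileColl) return ncclSuccess;  // ignore other event types
  struct exampleEvent* ev = (struct exampleEvent*)calloc(1, sizeof(*ev));
  if (ev == NULL) return ncclSuccess;                       // never fail the collective for profiling
  ev->type = eDescr->type;                                  // first field doubles as the type tag
  *eHandle = ev;
  (void)context;
  return ncclSuccess;
}

static ncclResult_t exampleStopEvent(void* eHandle) {
  free(eHandle);
  return ncclSuccess;
}

static ncclResult_t exampleRecordEventState(void* eHandle, ncclProfilerEventState_v5_t eState,
                                            ncclProfilerEventStateArgs_v5_t* eStateArgs) {
  (void)eHandle; (void)eState; (void)eStateArgs;
  return ncclSuccess;
}

static ncclResult_t exampleFinalize(void* context) {
  free(context);
  return ncclSuccess;
}

ncclProfiler_v5_t ncclProfiler_v5 = {
  "ExampleSkeleton",
  exampleInit, exampleStartEvent, exampleStopEvent,
  exampleRecordEventState, exampleFinalize,
};
```

NCCL resolves the plugin through the ncclProfiler_v5 symbol, so the struct field order must match ncclProfiler_v5_t exactly, as in the inspector's own ncclProfiler_v5 definition above.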
diff --git a/ext-tuner/example/.gitignore b/ext-tuner/example/.gitignore new file mode 100644 index 000000000..a3d6f635f --- /dev/null +++ b/ext-tuner/example/.gitignore @@ -0,0 +1,49 @@ +# Compiled shared objects and binaries +*.so +*.o +*.a +*.out +*.exe +*.dll +*.dylib +*.bin +*.elf + +# Python cache +__pycache__/ +*.pyc +*.pyo + +# Build and test artifacts +/build/ +*.log +*.tmp +*.swp + +# Ignore all CSV files except scripts/sample_performance_data.csv +*.csv +!scripts/sample_performance_data.csv + +# Ignore all .conf files except nccl_tuner.conf +*.conf +!nccl_tuner.conf + +my_configs + +# Ignore test binary +test/test_plugin + +# Editor/OS files +.DS_Store +Thumbs.db + +# Backup files +*~ +*.bak + +# Ignore by convention +*.old +*.orig + +# Git +.git/ diff --git a/ext-tuner/example/CMakeLists.txt b/ext-tuner/example/CMakeLists.txt new file mode 100644 index 000000000..1c116b446 --- /dev/null +++ b/ext-tuner/example/CMakeLists.txt @@ -0,0 +1,26 @@ +# Find all C source files in current directory +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c +) + +# Create shared library +add_library(nccl-tuner-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-tuner-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl +) + +# Set output name to match Makefile +set_target_properties(nccl-tuner-example PROPERTIES + OUTPUT_NAME "nccl-tuner-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins +) + +# Add custom target for clean (equivalent to Makefile clean target) +add_custom_target(clean-tuner-lib + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/libnccl-tuner-example.so + COMMENT "Cleaning libnccl-tuner-example.so" +) diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index 77b543d12..dc956b1c0 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -45,6 +45,40 @@ typedef enum { #define NCCL_ALGO_PROTO_IGNORE -1.0 +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +#define NCCL_NUM_HW_LINKS 3 + +#define NCCL_VOLTA_COMPCAP_IDX 0 +#define NCCL_AMPERE_COMPCAP_IDX 1 +#define NCCL_HOPPER_COMPCAP_IDX 2 +#define NCCL_BLACKWELL_COMPCAP_IDX 3 +#define NCCL_NUM_COMPCAPS 4 + +#define NCCL_TUNING_SCALE_1NODE 0 +#define NCCL_TUNING_SCALE_2NODES 1 +#define NCCL_TUNING_SCALE_4NODES 2 +#define NCCL_NUM_TUNING_SCALES 3 + +typedef struct { + int nNvlDomains; // number of NVLink domains + int minRanksPerNvlDomain; // minimum ranks across all NVLink domains + int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains +} ncclNvlDomainInfo_v5_t; + +typedef struct { + double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + + double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + + +} ncclTunerConstants_v5_t; + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -52,12 +86,17 @@ typedef struct { // Initializes tuner states. // Inputs: + // - commId: communicator identifier // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. // - nNodes: number of nodes in current communicator. 
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // - nvlDomainInfo: NVL domain information struct // Outputs: // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + // Input/Output: + // - constants: tuner constants + ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants); // Gets info (algo, protocol, number of ctas and threads) for a given collective. // Inputs: @@ -87,11 +126,13 @@ typedef struct { // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; + ncclResult_t (*finalize)(void* context); +} ncclTuner_v5_t; -typedef ncclTuner_v4_t ncclTuner_t; +typedef ncclTuner_v5_t ncclTuner_t; +typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t; +typedef ncclTunerConstants_v5_t ncclTunerConstants_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 1b8031ed1..af813495a 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -51,6 +51,7 @@ typedef struct { size_t nRanks; size_t nNodes; ncclDebugLogger_t logFunction; + ncclNvlDomainInfo_v5_t nvlDomainInfo; } TunerContext; // Parse collective type from string @@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { return ncclSuccess; } -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { +__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) { + + if (NULL != constants) { + // NCCL constants tuning + // Tune NCCL's internal tuning model to improve base algo/proto selection. + // Note: Example numbers are for reference only. + // Actual numbers may vary depending on the hardware and network topology. + // These numbers are not guaranteed to be optimal for all cases. 
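+ // Indexing convention (see nccl/tuner.h above): the bandwidth tables are
+ // [compute capability index][tuning scale], e.g. NCCL_BLACKWELL_COMPCAP_IDX and
+ // NCCL_TUNING_SCALE_4NODES, while hwLatencies is [hw link][algorithm][protocol].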
+ // Limit the tree bandwidth to 15GB/s + constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0; + + // Limit the ring bandwidth to 20GB/s + constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0; + + // Set NVLSTree base network latency to 24us + constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0; + } + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); if (!ctx) return ncclSystemError; @@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t ctx->nRanks = nRanks; ctx->nNodes = nNodes; ctx->logFunction = logFunction; + if (nvlDomainInfo) { + ctx->nvlDomainInfo = *nvlDomainInfo; + } else { + memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t)); + } if (logFunction) { logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, - "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains", + nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains); } // Try to load config file from environment variable or default location @@ -432,7 +457,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { +__hidden ncclResult_t pluginFinalize(void* context) { if (context) { TunerContext* ctx = (TunerContext*)context; if (ctx->configs) { @@ -443,11 +468,12 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + #define PLUGIN_NAME "Example" -const ncclTuner_v4_t ncclTunerPlugin_v4 = { +const ncclTuner_v5_t ncclTunerPlugin_v5 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, - .destroy = pluginDestroy + .finalize = pluginFinalize }; diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index 28897c449..c0300d51c 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -97,12 +97,12 @@ int test_plugin_init() { void* context = NULL; // Test successful initialization - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); TEST_ASSERT(context != NULL, "Context should be allocated"); // Clean up - pluginDestroy(context); + pluginFinalize(context); TEST_PASS(); } @@ -122,11 +122,11 @@ int test_config_parsing_valid() { setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); void* context = NULL; - ncclResult_t result = pluginInit(16, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_valid.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -143,12 +143,12 @@ int test_config_parsing_invalid() { setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); void* context = NULL; - ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); // Should still succeed but with no valid configs loaded TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); // Clean up - pluginDestroy(context); + pluginFinalize(context); 
unlink("test_invalid.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -164,7 +164,7 @@ int test_collective_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); // Create mock cost table float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -208,7 +208,7 @@ int test_collective_matching() { TEST_ASSERT(nChannels == 4, "Should set 4 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_match.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -225,7 +225,7 @@ int test_size_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -279,7 +279,7 @@ int test_size_matching() { TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_size.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -297,7 +297,7 @@ int test_topology_matching() { // Test with single node setup void* context1 = NULL; - pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL); // 8 ranks, 1 node float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -315,11 +315,11 @@ int test_topology_matching() { TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); - pluginDestroy(context1); + pluginFinalize(context1); // Test with 4 nodes, 32 ranks setup void* context2 = NULL; - pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL); // 32 ranks, 4 nodes for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { @@ -348,7 +348,7 @@ int test_default_channels() { setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -368,7 +368,7 @@ int test_default_channels() { TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_default.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -385,7 +385,7 @@ int test_regbuff_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -436,7 +436,7 @@ int test_regbuff_matching() { TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_regbuff.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -453,7 +453,7 @@ int test_pipeops_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 
0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -503,7 +503,7 @@ int test_pipeops_matching() { TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_pipeops.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -518,7 +518,7 @@ int test_no_match_fallback() { setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -542,7 +542,7 @@ int test_no_match_fallback() { TEST_ASSERT(nChannels == 1, "Should use default channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_fallback.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -592,7 +592,7 @@ int test_large_config() { // Initialize plugin with large config void* context = NULL; - ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); TEST_ASSERT(context != NULL, "Context should be allocated"); @@ -651,7 +651,7 @@ int test_large_config() { TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(large_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); @@ -683,7 +683,7 @@ int test_very_large_config_stress() { // Test initialization with stress config void* context = NULL; - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); TunerContext* ctx = (TunerContext*)context; @@ -704,7 +704,7 @@ int test_very_large_config_stress() { } // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(stress_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); @@ -725,7 +725,7 @@ int test_empty_config() { setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); void* context = NULL; - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); TunerContext* ctx = (TunerContext*)context; @@ -750,13 +750,134 @@ int test_empty_config() { TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(empty_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); } +// Test NVLink domain info handling +int test_nvl_domain_info() { + printf("Testing NVLink domain info handling...\n"); + + // Test NVLink domain structure with min/max ranks per domain + ncclNvlDomainInfo_v5_t nvl_domain = { + .nNvlDomains = 2, // 2 nodes = 2 domains + .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck) + .maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity) + }; + + void* context = NULL; + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL); + TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed"); + + // Validate NVLD info 
structure + TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)"); + TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain"); + TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain"); + + // Clean up + pluginFinalize(context); + printf("NVLink domain info test passed!\n"); + TEST_PASS(); +} + +int test_tuner_constants() { + // Initialize constants to -1.0 for testing purposes + ncclTunerConstants_v5_t constants = { + // Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] + .baseLatencies = { + {-1.0, -1.0, -1.0}, // NCCL_ALGO_TREE: LL, LL128, Simple + {-1.0, -1.0, -1.0}, // NCCL_ALGO_RING: LL, LL128, Simple + {-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS + {-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS_TREE + {-1.0, -1.0, -1.0} // NCCL_ALGO_PAT + }, + + // Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] + .hwLatencies = { + // NCCL_HW_NVLINK + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + }, + // NCCL_HW_PCI + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + }, + // NCCL_HW_NET + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + } + }, + + // LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .llMaxBws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxRingLL128Bws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxTreeLL128Bws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxTreeBws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + } + }; + + void* context = NULL; + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants); + TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed"); + + // Test that the constants were set correctly + TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should 
be 15GB/s"); + TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s"); + TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us"); + + // Clean up + pluginFinalize(context); + TEST_PASS(); +} + // Test runner function pointer type typedef int (*TestFunction)(void); @@ -782,6 +903,8 @@ TestCase test_cases[] = { {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, {"empty-config", test_empty_config, "Empty configuration file handling"}, + {"nvl-domain", test_nvl_domain_info, "NVL domain info handling"}, + {"constants", test_tuner_constants, "Tuner constants initialization"}, {NULL, NULL, NULL} // End marker }; @@ -825,6 +948,7 @@ int main(int argc, char* argv[]) { if (argc == 1) { // No arguments - run all tests for (int i = 0; test_cases[i].name != NULL; i++) { + printf("Running test: %s\n", test_cases[i].name); total++; passed += test_cases[i].func(); } diff --git a/makefiles/common.mk b/makefiles/common.mk index 0f01671b6..f8f455dec 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ +CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 -ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) -# SM35 is deprecated from CUDA12.0 onwards -CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 -endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 diff --git a/makefiles/version.mk b/makefiles/version.mk index 3b182d61b..d0e97c065 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 27 -NCCL_PATCH := 7 +NCCL_MINOR := 28 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/pkg/Makefile b/pkg/Makefile index ab6487be9..cffd5d76f 100644 --- a/pkg/Makefile +++ b/pkg/Makefile @@ -10,7 +10,7 @@ build : debian.build txz.build BUILDDIR ?= $(abspath ../build) ABSBUILDDIR := $(abspath $(BUILDDIR)) -TARGETS := debian txz +TARGETS := debian txz doc all: ${TARGETS:%=%.build} prep: ${TARGETS:%=%.prep} build: ${TARGETS:%=%.build} diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in index 45120e6de..b656e63ab 100644 --- a/pkg/debian/libnccl-dev.install.in +++ b/pkg/debian/libnccl-dev.install.in @@ -1,4 +1,4 @@ bin/ncclras /usr/bin -include/nccl.h /usr/include +include/* /usr/include lib/libnccl.so /usr/lib/${pkg:MultiArch} lib/libnccl_static.a /usr/lib/${pkg:MultiArch} diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in index d62955592..30ddbf19f 100644 --- a/pkg/redhat/nccl.spec.in +++ b/pkg/redhat/nccl.spec.in @@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li # devel install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} +cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/ install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} -install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} ln -s 
libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static @@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT %doc LICENSE.txt %defattr(-,root,root,-) %{_bindir}/ncclras -%{_includedir}/nccl.h +%{_includedir}/* %{_libdir}/libnccl.so %files static diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile index 01cab95a4..a8d9e0da9 100644 --- a/pkg/srctxz/Makefile +++ b/pkg/srctxz/Makefile @@ -22,7 +22,7 @@ prep: $(TXZTARGETS) build: prep $(MAKE) -C ../../src clean @printf "Building source tar.xz package\n" - (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) + (cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in index 11bdd52db..0e627dd25 100644 --- a/pkg/srctxz/create_srctxz.sh.in +++ b/pkg/srctxz/create_srctxz.sh.in @@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix} NCCL_BUILD=${pkg:Revision} NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" +if [ "${SRCTXZ_APITESTS}" = "1" ]; then + NCCLNAME+="-apitest" +fi + + +INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk") + +if [ "${SRCTXZ_APITESTS}" = "1" ]; then + # Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES + for entry in $(ls $NCCLDIR/test); do + if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then + EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry" + fi + done +else + # Exclude the entire test directory + EXCLUDE_TEST+=" --exclude test" +fi -tar --exclude build \ +tar --exclude fortran \ + --exclude doc \ + --exclude plc \ + --exclude build \ --exclude ".git*" \ + --exclude share \ + --exclude ompi \ + --exclude ext-net \ --exclude pkg/srctxz \ + --exclude docker \ + $EXCLUDE_TEST \ --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..5ab69dc92 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,180 @@ +# Source files +set(LIBSRCFILES + bootstrap.cc + channel.cc + ce_coll.cc + collectives.cc + debug.cc + enqueue.cc + group.cc + init.cc + init_nvtx.cc + proxy.cc + transport.cc + mnnvl.cc + allocator.cc + sym_kernels.cc + dev_runtime.cc +) + +# Add compatibility shim if using static cudart +if(CUDARTLIB STREQUAL "cudart_static") + list(APPEND LIBSRCFILES enhcompat.cc) +endif() + +# Configure pkg-config file +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in + ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc + @ONLY +) + +# Add files from subdirectories +add_subdirectory(transport) +add_subdirectory(misc) +add_subdirectory(register) +add_subdirectory(graph) +add_subdirectory(plugin) +add_subdirectory(device) +add_subdirectory(nccl_device) +add_subdirectory(ras) +add_subdirectory(scheduler) + +add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=) + +# Add all source files +list(APPEND LIBSRCFILES + ${TRANSPORT_SOURCES} + ${MISC_SOURCES} + ${REGISTER_SOURCES} + ${GRAPH_SOURCES} + ${PLUGIN_SOURCES} + ${RAS_SOURCES} + ${SYM_SOURCES} + ${SCHEDULER_SOURCES} +) + +###################### Create a shared NCCL library ############################ +add_library(nccl SHARED) + +target_sources(nccl PRIVATE ${LIBSRCFILES}) + +# Include directories +target_include_directories(nccl PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/device + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + 
+add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include + COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g" + -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g" + -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g" + -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g" + -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g" + ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h + BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h +) + +add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h) + +add_dependencies(nccl nccl_header) + +# Set version and output name +set_target_properties(nccl PROPERTIES + VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH} + SOVERSION ${NCCL_MAJOR} + OUTPUT_NAME "nccl" + PREFIX "lib" +) + +# Set CUDA specific flags +set_target_properties(nccl PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" + POSITION_INDEPENDENT_CODE ON +) + +# Link libraries +target_link_libraries(nccl + PRIVATE + nccl_device + pthread + rt + dl + ${CUDAToolkit_LIBRARIES} + ${EXTRA_LIBS} +) + +# Set output directories for nccl shared library +set_target_properties(nccl PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) + +###################### Create a ras binary executable ############################ +set(RAS_BINSRCFILES ras/client.cc) + +add_executable(ncclras ${RAS_BINSRCFILES}) + +target_include_directories(ncclras PUBLIC + ${CMAKE_BINARY_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} +) + +add_dependencies(ncclras nccl_header) + +target_link_libraries(ncclras + PRIVATE + pthread + rt + dl +) + +# Set output directory for ncclras executable +set_target_properties(ncclras PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" +) + +###################### Create a static NCCL library ############################ +add_library(nccl_static STATIC ${LIBSRCFILES}) + +# Include directories +target_include_directories(nccl_static PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/device + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + +# Add dependency on nccl_header +add_dependencies(nccl_static nccl_header) + +# Link libraries +target_link_libraries(nccl_static + PRIVATE + nccl_device + pthread + rt + dl + ${CUDAToolkit_LIBRARIES} + ${EXTRA_LIBS} +) + +# Set CUDA specific flags +set_target_properties(nccl_static PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" + POSITION_INDEPENDENT_CODE ON +) + +# Set output directory for nccl_static library +set_target_properties(nccl_static PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) diff --git a/src/Makefile b/src/Makefile index eab662ef9..be026cc26 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,10 +7,12 @@ include ../makefiles/common.mk include ../makefiles/version.mk ##### src files -INCEXPORTS := nccl.h +INCEXPORTS := nccl.h nccl_device.h \ + $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h)) + LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard 
transport/*.cc) \ @@ -19,6 +21,8 @@ LIBSRCFILES := \ $(wildcard plugin/net/*.cc) \ $(wildcard plugin/tuner/*.cc) \ $(wildcard plugin/profiler/*.cc) \ + $(wildcard nccl_device/*.cc) \ + $(wildcard scheduler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) install -m 644 $< $@ +$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device + install -m 644 $< $@ + +$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/impl + install -m 644 $< $@ + $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) @@ -149,7 +163,7 @@ install : build mkdir -p $(PREFIX)/bin cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ - cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ + cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/ cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/ FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') diff --git a/src/allocator.cc b/src/allocator.cc index c58181948..f5638b92d 100644 --- a/src/allocator.cc +++ b/src/allocator.cc @@ -7,10 +7,11 @@ #include "comm.h" #include "transport.h" #include "group.h" +#include "nvtx.h" NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); ncclResult_t ncclMemAlloc(void **ptr, size_t size) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; ncclResult_t ret = ncclSuccess; #if CUDART_VERSION >= 12010 @@ -98,7 +99,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { NCCL_API(ncclResult_t, ncclMemFree, void *ptr); ncclResult_t ncclMemFree(void *ptr) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; ncclResult_t ret = ncclSuccess; int saveDevice; @@ -127,70 +128,339 @@ ncclResult_t ncclMemFree(void *ptr) { goto exit; } -// This is a collective function and should be called by all ranks in the communicator -ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) { - ncclResult_t ret = ncclSuccess; - void* regSymAddr = NULL; - size_t allocSize = size; - size_t granularity; - CUdevice cuDev; - CUmemAllocationProp memprop = {}; - CUmemGenericAllocationHandle memHandle; - int bit = 0, cnt = 0; - - // aligment must be power of 2 as an input - while (bit < sizeof(size_t) * 8) { - if (alignment & (1L << bit)) cnt++; - if (cnt == 2) { - WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment); - goto fail; +//////////////////////////////////////////////////////////////////////////////// +// ncclSpace: +// +// This datastructure "cuts" the line of non-negative integers into segments +// which alternate between "full" (allocated) and "empty" (not allocated). The +// cuts are sorted ascending. The segment after the last cut must be empty +// (the unallocated frontier). Knwoing this we can deduce whether the segment +// ending at cut[i] is full or empty with this formula: +// isFull(i) = (i%2 != ncuts%2) + +void ncclSpaceConstruct(struct ncclSpace* a) { + memset(a, 0, sizeof(*a)); +} + +void ncclSpaceDestruct(struct ncclSpace* a) { + free(a->cuts); +} + +static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) { + // Insert space for two cuts in `a->cuts[]` before `index`. 
+ if (a->count + 2 > a->capacity) { + a->capacity *= 2; + if (a->capacity == 0) a->capacity = 16; + int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t)); + for (int i=0; i < index; i++) cuts1[i] = a->cuts[i]; + for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i]; + free(a->cuts); + a->cuts = cuts1; + } else { + for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i]; + } + a->cuts[index+0] = lo; + a->cuts[index+1] = hi; + a->count += 2; + + // Filter pairs of adjacent repeated values from cuts[]. Since these mark + // boundaries where segments transition between full<->empty, dropping such a + // pair fuses two adjacent segments together. Examples: + // [1,2,3,3,4] -> [1,2,4] + // [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because its a full<->empty transition + // [1,2,3,3,3,3,4] -> [1,2,4] + // Leading zeros don't have to be in pairs, they are always dropped: + // [0,1,2] -> [1,2] + // [0,0,1,2] -> [1,2] + int r = index, w = index; // Read and write cursors. + int64_t prev = r==0 ? 0 : a->cuts[r-1]; + while (r < a->count) { + int64_t cur = a->cuts[r++]; + a->cuts[w++] = cur; + if (prev == cur) { // Repeated value is an empty segment which can be deleted. + // Erase last two cuts or just one if we're at the start. + w -= w==1 ? 1 : 2; + // Zeros can only occur at the beginning (due to being sorted). We want to + // drop any number of zeros, but only even numbers of other repeated values. + // So set to zero here, which will make prev=0, thus if next value is zero + // it will be dropped but if its not zero then it will need to begin a new + // pair to be dropped. + cur = 0; } - bit++; - } - // temporarily align the alignment to NCCL_REC_PAGE_SIZE - ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE); - - CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail); - memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - memprop.requestedHandleTypes = ncclCuMemHandleType; - memprop.location.id = cuDev; - CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - ALIGN_SIZE(allocSize, granularity); - - CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail); - ALIGN_SIZE(comm->symAllocHead, alignment); - NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); - comm->symAllocHead += allocSize; - *symPtr = regSymAddr; + prev = cur; + } + a->count = w; +} -exit: - return ret; -fail: - *symPtr = NULL; - goto exit; +ncclResult_t ncclSpaceAlloc( + struct ncclSpace* a, int64_t limit, int64_t size, int align, + int64_t* outOffset + ) { + // When allocating we try to locate the first empty segment which can hold + // the allocation and move its lower cut upward. + int i = a->count%2; // First empty segment ends at cuts[i] + size_t off; + while (i <= a->count) { + size_t lo = i == 0 ? 0 : a->cuts[i-1]; + size_t hi = i == a->count ? limit : a->cuts[i]; + off = alignUp(lo, align); + if (off + size <= hi) { + *outOffset = off; + if (i == 0 || off + size == hi) { // Slow path required. + insertSegment(a, i, off, off+size); + } else { // We can just append to the end of a full segment. 
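
To make the cut-based bookkeeping described in the comment above concrete, here is a minimal standalone sketch (illustrative names only, not the ncclSpace API from this patch): a sorted list of cut points where segments alternate full/empty, the region past the last cut is always empty, a segment ending at cuts[i] is full iff i%2 != ncuts%2, and allocation is first-fit over the empty segments. The real ncclSpaceAlloc additionally aligns offsets and insertSegment fuses repeated cuts so adjacent segments coalesce; this toy skips both.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy model of the alternating-cuts idea: `cuts` is sorted ascending, segments
// alternate full/empty, and the segment after the last cut is always empty.
struct ToySpace {
  std::vector<int64_t> cuts;

  // Segment ending at cuts[i] is full iff the parities of i and cuts.size() differ.
  bool isFull(size_t i) const { return (i % 2) != (cuts.size() % 2); }

  // First-fit allocation: walk the empty segments and carve `size` bytes out of
  // the first one that can hold it. Returns false if nothing fits below `limit`.
  bool alloc(int64_t limit, int64_t size, int64_t* outOffset) {
    size_t i = cuts.size() % 2;              // first empty segment ends at cuts[i]
    for (; i <= cuts.size(); i += 2) {
      int64_t lo = (i == 0) ? 0 : cuts[i - 1];
      int64_t hi = (i == cuts.size()) ? limit : cuts[i];
      if (lo + size <= hi) {
        *outOffset = lo;
        // Mark [lo, lo+size) as full by adding two cuts (no coalescing in this toy).
        cuts.insert(cuts.begin() + i, {lo, lo + size});
        return true;
      }
    }
    return false;
  }
};

int main() {
  ToySpace s;
  int64_t a, b;
  s.alloc(1 << 20, 256, &a);   // a == 0
  s.alloc(1 << 20, 512, &b);   // b == 256 here (the real code also aligns and fuses cuts)
  printf("a=%ld b=%ld cuts=%zu\n", (long)a, (long)b, s.cuts.size());
  return 0;
}
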
+ a->cuts[i-1] = off + size; + } + return ncclSuccess; + } + i += 2; // Next empty segment + } + WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit); + return ncclInternalError; } -ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) { - CUmemGenericAllocationHandle handle; - size_t size = 0; - ncclResult_t ret = ncclSuccess; - int saveDev = comm->cudaDev; - CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); - if (ncclCuMemEnable()) { - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail); - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail); - NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail); - CUCHECKGOTO(cuMemRelease(handle), ret, fail); +ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) { + if (a->count == 0 || a->cuts[a->count-1] <= offset) { + WARN("No allocation found at offset=0x%lx", (long)offset); + return ncclInternalError; } -exit: - CUDACHECK(cudaSetDevice(saveDev)); - return ret; -fail: - goto exit; + + // This could be binary search, but since allocate is linear there's no point. + int i = 1 - a->count%2; // First full segment ends at cuts[i] + while (a->cuts[i] <= offset) i += 2; + + int64_t lo = i==0 ? 0 : a->cuts[i-1]; + int64_t hi = a->cuts[i]; + + if (offset < lo || hi < offset + size) { + WARN("Given size=0x%lx extends beyond allocation.", (long)size); + return ncclInternalError; + } + + // First try the two fast cases which just shrink a segment from one side. + if (i != 0 && lo == offset && offset + size != hi) { + a->cuts[i-1] = offset + size; // Bring bottom up. + } else if (lo != offset && offset + size == hi) { + a->cuts[i] = offset; // Bring top down. + } else { // Slow path. + insertSegment(a, i, offset, offset+size); + } + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// +// ncclShadowPool: + +struct ncclShadowPage { // A contiguous block of (at most) 64 objects + struct ncclShadowPage* next; + int objSize; + uint64_t freeMask; + void* devObjs; +}; +struct ncclShadowObject { + struct ncclShadowObject* next; + void* devObj; + void* hostObj; + struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool. +}; + +void ncclShadowPoolConstruct(struct ncclShadowPool* pool) { + pool->hbits = 0; + pool->count = 0; + pool->table = nullptr; + pool->pages = nullptr; +} + +ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) { + if (pool->hbits != 0) { + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + if (pool->count != 0) { + for (int i=0; i < 1<hbits; i++) { + struct ncclShadowObject* obj = pool->table[i]; + while (obj != nullptr) { + struct ncclShadowPage* page = obj->page; + if (page != nullptr) { + if (page->freeMask == 0) { // Put full pages back into page list. 
+ page->freeMask = 1; + page->next = pool->pages; + pool->pages = page; + } + } else { + cudaFreeAsync(obj->devObj, stream); + } + struct ncclShadowObject* next = obj->next; + free(obj); + obj = next; + } + } + } + free(pool->table); + + while (pool->pages != nullptr) { + cudaFreeAsync(pool->pages->devObjs, stream); + struct ncclShadowPage* next = pool->pages->next; + free(pool->pages); + pool->pages = next; + } + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + cudaMemPoolDestroy(pool->memPool); + } + return ncclSuccess; +} + +static int hashBucket(int hbits, void* devObj) { + uintptr_t h = reinterpret_cast(devObj); + h ^= h>>32; + h *= 0x9e3779b97f4a7c13; + return (uint64_t)h >> (64-hbits); +} + +static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) { + int b = hashBucket(pool->hbits, obj->devObj); + obj->next = pool->table[b]; + pool->table[b] = obj; +} + +ncclResult_t ncclShadowPoolAlloc( + struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj, + cudaStream_t stream + ) { + if (size == 0) { + if (outDevObj) *outDevObj = nullptr; + if (outHostObj) *outHostObj = nullptr; + return ncclSuccess; + } + + int hbits = pool->hbits; + if (hbits == 0) { + cudaMemPoolProps props = {}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + cudaGetDevice(&props.location.id); + CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props)); + + pool->hbits = hbits = 4; + pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<table[i] = nullptr; + } + + // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio. + if (pool->count+1 > 2<table; + struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1)); + pool->table = table1; + pool->hbits = hbits+1; + for (int i1=0; i1 < 2<next; + hashInsert(pool, obj); + obj = next; + } + } + hbits += 1; // match pool->hbits + free(table0); + } + + struct ncclShadowPage* page; + void *devObj; + if ((64<<10)/size >= 3) { + int shift = std::max(0, (int)log2Down(size) + 1 - 4); + int pageObjSize = ((size + (1<>shift)<pages; + while (true) { + page = *pagePtr; + if (page == nullptr) { + size_t pageSize = std::min(64<<10, 64*pageObjSize); + page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage)); + page->objSize = pageObjSize; + page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize); + page->next = pool->pages; + pool->pages = page; + CUDACHECK(cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream)); + CUDACHECK(cudaMemsetAsync(page->devObjs, 0, pageSize, stream)); + // fall through... + } + if (page->objSize == pageObjSize) { + int slot = popFirstOneBit(&page->freeMask); + devObj = (char*)page->devObjs + slot*pageObjSize; + if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list. 
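
Two small techniques carry most of the shadow-pool logic above: a multiplicative hash over the device pointer that keeps the top hbits bits, and pages of at most 64 objects whose free slots are tracked in a single 64-bit freeMask. A minimal sketch of both follows (illustrative names, not the NCCL internals; __builtin_ctzll is a GCC/Clang builtin playing the role of popFirstOneBit in the patch).

#include <cstdint>
#include <cstdio>

// 1) Multiplicative hash keeping the top `hbits` bits of the mixed pointer value.
static int hashBucket(int hbits, const void* p) {
  uint64_t h = (uint64_t)(uintptr_t)p;
  h ^= h >> 32;
  h *= 0x9e3779b97f4a7c13ull;                  // 2^64 / golden ratio
  return (int)(h >> (64 - hbits));
}

// 2) A 64-slot page whose free slots live in one 64-bit mask (bit i set => slot i free).
struct ToyPage {
  uint64_t freeMask;
};

// Grab the lowest free slot, or -1 if the page is full.
static int pageAlloc(ToyPage* page) {
  if (page->freeMask == 0) return -1;
  int slot = __builtin_ctzll(page->freeMask);  // index of lowest set bit
  page->freeMask &= page->freeMask - 1;        // clear it
  return slot;
}

static void pageFree(ToyPage* page, int slot) {
  page->freeMask |= (uint64_t)1 << slot;
}

int main() {
  ToyPage page = { ~0ull };                    // 64 free slots
  int a = pageAlloc(&page);                    // 0
  int b = pageAlloc(&page);                    // 1
  pageFree(&page, a);
  printf("a=%d b=%d bucket=%d\n", a, b, hashBucket(4, &page));
  return 0;
}

In the patch, a page whose freeMask reaches zero is unlinked from the page list and relinked on the first free, so allocation never scans full pages.
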
+ break; + } + pagePtr = &page->next; + } + } else { + page = nullptr; + CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream)); + CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream)); + } + + struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc( + sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size + ); + obj->page = page; + obj->devObj = devObj; + obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t)); + memset(obj->hostObj, 0, size); + hashInsert(pool, obj); + pool->count += 1; + if (outDevObj) *outDevObj = devObj; + if (outHostObj) *outHostObj = obj->hostObj; + return ncclSuccess; +} + +ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) { + if (devObj == nullptr) return ncclSuccess; + + int b = hashBucket(pool->hbits, devObj); + struct ncclShadowObject** pobj = &pool->table[b]; + while (true) { + if (*pobj == nullptr) { + WARN("Device object does not exist in shadow pool."); + return ncclInternalError; + } + if ((*pobj)->devObj == devObj) break; + pobj = &(*pobj)->next; + } + struct ncclShadowObject* obj = *pobj; + *pobj = obj->next; + if (obj->page != nullptr) { + if (obj->page->freeMask == 0) { + obj->page->next = pool->pages; + pool->pages = obj->page; + } + int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize; + obj->page->freeMask |= uint64_t(1)<count -= 1; + return ncclSuccess; +} + +ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) { + if (devObj == nullptr) { + *hostObj = nullptr; + return ncclSuccess; + } + + int b = hashBucket(pool->hbits, devObj); + struct ncclShadowObject* obj = pool->table[b]; + while (true) { + if (obj == nullptr) { + WARN("Device object does not exist in shadow pool."); + return ncclInternalError; + } + if (obj->devObj == devObj) break; + obj = obj->next; + } + *hostObj = obj->hostObj; + return ncclSuccess; } diff --git a/src/bootstrap.cc b/src/bootstrap.cc index f05337249..7615b9c52 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -14,6 +14,7 @@ #include "proxy.h" #include "param.h" #include "ras.h" +#include #define BOOTSTRAP_N_CHECK_ABORT 10000 #define BOOTSTRAP_TAG_CONNECT (0x1 << 31) @@ -85,13 +86,13 @@ struct bootstrapRootArgs { static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; -pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex bootstrapNetMutex; NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0); ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { - pthread_mutex_lock(&bootstrapNetLock); + std::lock_guard lock(bootstrapNetMutex); if (bootstrapNetInitDone == 0) { const char* env = ncclGetEnv("NCCL_COMM_ID"); int nIfs = 0; @@ -99,21 +100,18 @@ ncclResult_t bootstrapNetInit() { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, &nIfs)); if (nIfs <= 0) { WARN("NET/Socket : No usable listening interface found"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs)); if (nIfs <= 0) { WARN("Bootstrap : no socket interface 
found"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidUsage; } } @@ -123,7 +121,6 @@ ncclResult_t bootstrapNetInit() { INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line); bootstrapNetInitDone = 1; } - pthread_mutex_unlock(&bootstrapNetLock); } return ncclSuccess; } @@ -485,7 +482,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) { static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { static int devOOB = -1; if (devOOB < 0) { - pthread_mutex_lock(&bootstrapNetLock); + std::lock_guard lock(bootstrapNetMutex); if (devOOB < 0) { const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { @@ -516,7 +513,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); else WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } } else { @@ -529,13 +525,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { bool hasProp = res == ncclSuccess; INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1); } - pthread_mutex_unlock(&bootstrapNetLock); } *dev = devOOB; return ncclSuccess; } -static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE], +static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE], void** sendComm, ncclNetDeviceHandle_t** sendDevHandle, void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) { @@ -543,7 +538,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -655,7 +650,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { if (ncclParamBootstrapNetEnable()) { // Create net interface for other ranks to contact me (all gather) NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev))); - NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); + NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { // create socket for ring neightbor to contact mee @@ -709,7 +704,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { // accept and connect the ring network if (ncclParamBootstrapNetEnable()) { - NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle, + NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle, &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag)); } else { @@ -802,7 +797,7 @@ ncclResult_t bootstrapSplit(uint64_t 
magic, struct ncclComm* comm, struct ncclCo // create a handle for the others to reach out to me if (ncclParamBootstrapNetEnable()) { NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail); - NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); + NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { // create socket for ring neightbor to contact mee @@ -821,7 +816,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail); if (ncclParamBootstrapNetEnable()) { - NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle, + NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle, &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag), ret, fail); diff --git a/src/ce_coll.cc b/src/ce_coll.cc new file mode 100644 index 000000000..3f3dcbd7f --- /dev/null +++ b/src/ce_coll.cc @@ -0,0 +1,615 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "register_inline.h" +#include +#include "cudawrap.h" +#include "ce_coll.h" +#include "alloc.h" + +// Static constant for graph synchronization +static const uint32_t GRAPH_SYNC_VALUE = 1; + +// Static constants for intra-batch synchronization to improve CE collective performance with large scale +// Frequency of intra-batch synchronization +static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8; +// Message threshold for intra-batch synchronization +static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024; + +ncclResult_t ncclCeInit(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + + uint8_t* ceDevBase; + size_t ceDevBaseSize = alignUp(comm->nRanks*sizeof(uint32_t), 16) * 2; + ncclWindow_vidmem* ceWinDev; + ncclWindow_vidmem* ceWinDevHost; + + // Ensure symmetric memory runtime is initialized + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + // Allocate and register memory for the symmetric memory + NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail); + NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail); + NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail); + // Get the ncclDevrWindow from the winHost field + comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost; + + comm->ceColl.baseUCSymReadyOffset = 0; + comm->ceColl.baseUCSymComplOffset = alignUp(comm->nRanks*sizeof(uint32_t), 16); + comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset; + comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + 
comm->ceColl.baseUCSymComplOffset; + comm->ceColl.ceSeqNum = 0; + comm->ceColl.useCompletePtr = false; + comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ; + comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD; + INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeFinalize(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + + // Clean up ceInitTaskQueue + while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); + free(task); + } + + // Clean up CE resources + if (comm->ceColl.baseUCSymReadyPtr != NULL) { + if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) { + NCCLCHECKGOTO(ncclCommWindowDeregister(comm, comm->ceColl.ceSyncWin->vidmem), ret, fail); + NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail); + } + comm->ceColl.baseUCSymReadyPtr = NULL; + comm->ceColl.baseUCSymComplPtr = NULL; + comm->ceColl.ceSyncWin = NULL; + } + +exit: + return ret; +fail: + goto exit; +} + +bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { + int driverVersion; + if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false; + + // CE is supported in CUDA 12.5 and later + if (driverVersion >= 12050) { + switch (coll) { + case ncclFuncAllGather: + case ncclFuncAlltoAll: + case ncclFuncScatter: + case ncclFuncGather: + return true; + default: + return false; + } + } + return false; +} + +ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + uint32_t currentSeq = ++comm->ceColl.ceSeqNum; + + // Source pointer is either the constant graph sync value or the sequence number + void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)¤tSeq; + // Wait value is either the constant graph sync value or the sequence number + uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq; + + // Use multi-cast address as destination pointer + void* mcDstPtr; + void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; + size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail); + + // Write our own ready/complete flag to the multi-cast address + CUDACHECKGOTO(cudaMemcpyAsync( + mcDstPtr, + srcPtr, + sizeof(uint32_t), + cudaMemcpyHostToDevice, + stream), ret, fail); + + // Add local wait operations for every other rank + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + batchParams[*opIdx] = {}; + batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32; + batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? 
(void*)&completePtrs[r] : (void*)&readyPtrs[r]); + batchParams[*opIdx].waitValue.value = waitValue; + batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + (*opIdx)++; + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete, + CUstreamBatchMemOpParams* batchParams, + size_t* opIdx) { + ncclResult_t ret = ncclSuccess; + + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + uint32_t currentSeq = ++comm->ceColl.ceSeqNum; + + // Write our own ready/complete flag to remote ranks + uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq; + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + void * peerDstPtr; + void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; + size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail); + batchParams[*opIdx] = {}; + batchParams[*opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32; + batchParams[*opIdx].writeValue.address = (CUdeviceptr)peerDstPtr; + batchParams[*opIdx].writeValue.value = waitValue; + batchParams[*opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT; + (*opIdx)++; + } + + // Add local wait operations for every other rank + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + batchParams[*opIdx] = {}; + batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32; + batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]); + batchParams[*opIdx].waitValue.value = waitValue; + batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + (*opIdx)++; + } + +exit: + return ret; +fail: + goto exit; +} + + +ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Get pointers to the ready and complete synchronization arrays + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + // Allocate enough slots for all possible ops + size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks; + size_t opIdx = 0; + + // Prepare batch memory operations for synchronization + CUstreamBatchMemOpParams* batchParams = nullptr; + NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail); + + if (comm->nvlsSupport) { + NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail); + } else { + NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail); + } + + // For CUDA graph capture, add reset operation + if (ncclCudaGraphValid(comm->planner.capturingGraph)) { + for (int i = 0; i < comm->nRanks; i++) { + batchParams[opIdx] = {}; + batchParams[opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32; + batchParams[opIdx].writeValue.address = (CUdeviceptr)(comm->ceColl.useCompletePtr ? 
(void*)&completePtrs[i] : (void*)&readyPtrs[i]); + batchParams[opIdx].writeValue.value = 0; + batchParams[opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT; + opIdx++; + } + } + + // Execute all memory operations in a single batch + CUCHECKGOTO(cuStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail); + + // Toggle the flag for next call + comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr; + +exit: + if (batchParams) free(batchParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) { + ncclResult_t ret = ncclSuccess; + + params->srcs = nullptr; + params->dsts = nullptr; + params->sizes = nullptr; + params->numOps = 0; + params->intraBatchSync = false; +#if CUDART_VERSION >= 12080 + params->attrs = nullptr; + params->attrIdxs = nullptr; + params->numAttrs = 0; +#endif + + NCCLCHECKGOTO(ncclCalloc(¶ms->srcs, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->dsts, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->sizes, nRanks), ret, fail); +#if CUDART_VERSION >= 12080 + NCCLCHECKGOTO(ncclCalloc(¶ms->attrs, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->attrIdxs, nRanks), ret, fail); +#endif +exit: + return ret; +fail: + goto exit; +} + +void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) { + if (params->srcs) free(params->srcs); + if (params->dsts) free(params->dsts); + if (params->sizes) free(params->sizes); +#if CUDART_VERSION >= 12080 + if (params->attrs) free(params->attrs); + if (params->attrIdxs) free(params->attrIdxs); +#endif +} + +ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Check if there are any operations to perform + if (params->numOps == 0) { + return ncclSuccess; + } + + // Check if we are in a CUDA graph capture + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + + int driverVersion; + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail); + + //--------------Graph capture-------------- + // cudaMemcpyBatchAsync is not supported during CUDA graph capture + if (capturing) { + for (int i =0; i < params->numOps; i++) { + CUDACHECKGOTO(cudaMemcpyAsync( + (void*)params->dsts[i], + (void*)params->srcs[i], + params->sizes[i], + cudaMemcpyDeviceToDevice, + stream), ret, fail); + + if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } + //--------------No graph capture-------------- + else { + if (CUDART_VERSION >= 12080 && driverVersion >= 12080) { +#if CUDART_VERSION >= 12080 + // For CUDA 12.8+, use batch memory copy for better performance + params->attrs[0] = {}; + params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream; + params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute; + params->attrIdxs[0] = 0; + params->numAttrs = 1; + + if (params->intraBatchSync) { + // Break into multiple batches with sync between them + int batchSize = comm->ceColl.intraBatchSyncFreq; + for (int i = 0; i < params->numOps; i += batchSize) { + int currentBatchSize = (i + batchSize <= params->numOps) ? 
batchSize : params->numOps - i; + + #if CUDART_VERSION >= 13000 + CUDACHECKGOTO(cudaMemcpyBatchAsync( + ¶ms->dsts[i], ¶ms->srcs[i], ¶ms->sizes[i], currentBatchSize, + params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail); + #else + CUDACHECKGOTO(cudaMemcpyBatchAsync( + ¶ms->dsts[i], ¶ms->srcs[i], ¶ms->sizes[i], currentBatchSize, + params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail); + #endif + + // Sync after each batch + if (i + batchSize < params->numOps) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } else { + // Use single batch for all operations + #if CUDART_VERSION >= 13000 + CUDACHECKGOTO(cudaMemcpyBatchAsync( + params->dsts, params->srcs, params->sizes, params->numOps, + params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail); + #else + CUDACHECKGOTO(cudaMemcpyBatchAsync( + params->dsts, params->srcs, params->sizes, params->numOps, + params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail); + #endif + } +#endif + } else { + // For older CUDA versions, fall back to individual transfers + for (int i = 0; i < params->numOps; i++) { + CUDACHECKGOTO(cudaMemcpyAsync( + (void*)params->dsts[i], + (void*)params->srcs[i], + params->sizes[i], + cudaMemcpyDeviceToDevice, + stream), ret, fail); + + if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } + } + +exit: + return ret; +fail: + goto exit; +} + + +ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of each rank's data chunk + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff + comm->rank * chunkBytes; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + // Copy own data to receive buffer if operation is out-of-place + if (myRecvBuff != mySendBuff) { + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)myRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Copy data to other ranks + for (int r = 1; r < comm->nRanks; r++) { + int targetRank = (comm->rank + r) % comm->nRanks; + offset = myRecvBuff - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, targetRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Check if we need to perform intra-batch synchronization + batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold); + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + 
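
The ready/complete handshake issued by ncclMemOpSync above boils down to a batch of write-value and wait-value operations submitted with cuStreamBatchMemOp. Below is a minimal single-process sketch of that pattern (illustrative only: the patch points these operations at flags living in peer-accessible symmetric windows, and batched stream memory operations require driver support).

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  cudaSetDevice(0);                          // also makes the primary context current
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint32_t* flags;                           // flags[0] = "mine", flags[1] = "peer"
  cudaMalloc(&flags, 2 * sizeof(uint32_t));
  cudaMemset(flags, 0, 2 * sizeof(uint32_t));

  // Pretend the peer already published its flag; in the real code this write
  // arrives from another rank while our stream is blocked on the wait op.
  uint32_t one = 1;
  cudaMemcpy(&flags[1], &one, sizeof(one), cudaMemcpyHostToDevice);

  CUstreamBatchMemOpParams ops[2] = {};
  ops[0].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;   // publish our flag
  ops[0].writeValue.address   = (CUdeviceptr)(uintptr_t)&flags[0];
  ops[0].writeValue.value     = 1;
  ops[0].writeValue.flags     = CU_STREAM_WRITE_VALUE_DEFAULT;
  ops[1].waitValue.operation  = CU_STREAM_MEM_OP_WAIT_VALUE_32;    // wait for the peer's
  ops[1].waitValue.address    = (CUdeviceptr)(uintptr_t)&flags[1];
  ops[1].waitValue.value      = 1;
  ops[1].waitValue.flags      = CU_STREAM_WAIT_VALUE_EQ;

  CUresult res = cuStreamBatchMemOp(stream, 2, ops, 0);
  cudaStreamSynchronize(stream);
  printf("cuStreamBatchMemOp returned %d\n", (int)res);

  cudaFree(flags);
  cudaStreamDestroy(stream);
  return 0;
}

Outside of graph capture the patch uses a per-call sequence number as the written/awaited value; under capture it falls back to a constant and resets the flags at the end of the batch, since the captured work replays with the same value each launch.
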
ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data each rank sends to every other rank + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks * comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + // Copy data to other ranks: send data chunk for each destination rank + for (int r = 0; r < comm->nRanks; r++) { + int dstRank = (comm->rank + r) % comm->nRanks; + uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; + uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; + + if (dstRank == comm->rank) { + // Local copy for own data + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } else { + // Remote copy to other ranks: send to rank dstRank's receive buffer at position comm->rank + offset = dstPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } + + // Check if we need to perform intra-batch synchronization + batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold); + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data root sends to each rank + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + int rootRank = args->rootRank; + void* peerDstPtr; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + if (comm->rank == rootRank) { + // Check if this is an in-place scatter operation + bool isInPlace = (myRecvBuff == mySendBuff + comm->rank * chunkBytes); + + // Copy root's own data first if not in-place + if (!isInPlace) { + uint8_t* srcPtr = mySendBuff + comm->rank * chunkBytes; + uint8_t* dstPtr = myRecvBuff; + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + 
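
The chunk addressing used by ncclCeAllGather and ncclCeAlltoAll above is plain offset arithmetic over equally sized per-rank chunks; only the resolution of the destination pointer goes through the peer's registered window (ncclDevrGetLsaRankPtr). A host-only illustration of the offsets, with made-up rank and chunk values:

#include <cstdio>
#include <cstddef>

int main() {
  const int nRanks = 4, myRank = 1;
  const size_t chunkBytes = 1024;

  // AllGather: every rank pushes its send chunk to offset myRank*chunkBytes
  // of every peer's receive buffer.
  for (int r = 0; r < nRanks; r++) {
    size_t dstOff = (size_t)myRank * chunkBytes;
    printf("allgather: rank %d -> peer %d recvBuff + %zu\n", myRank, r, dstOff);
  }

  // AlltoAll: the chunk destined for rank d comes from offset d*chunkBytes of
  // our send buffer and lands at offset myRank*chunkBytes of d's receive buffer.
  for (int d = 0; d < nRanks; d++) {
    size_t srcOff = (size_t)d * chunkBytes;
    size_t dstOff = (size_t)myRank * chunkBytes;
    printf("alltoall: send + %zu -> peer %d recvBuff + %zu\n", srcOff, d, dstOff);
  }
  return 0;
}
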
batchOpsParams.numOps++; + } + + // Root rank distributes data to other ranks + for (int r = 1; r < comm->nRanks; r++) { + int dstRank = (comm->rank + r) % comm->nRanks; + uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; + uint8_t* dstPtr = isInPlace ? myRecvBuff + dstRank * chunkBytes : myRecvBuff; + + offset = dstPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerDstPtr), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerDstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } + // Non-root ranks don't need to perform any copy operations + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data each rank sends to root + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + int rootRank = args->rootRank; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, 1), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + if (comm->rank == rootRank) { + // Root rank copies its own data to the correct position in receive buffer + uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; + if (mySendBuff != dstPtr) { + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } else { + // Non-root ranks send their data to root's receive buffer + uint8_t* rootRecvPtr = (uint8_t*)args->recvBuff + comm->rank * chunkBytes; + offset = rootRecvPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, rootRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; + cudaStream_t stream = comm->planner.streams->stream; + struct ncclCeCollArgs* args = plan->ceCollArgs; + + switch (args->func) { + case ncclFuncAllGather: + NCCLCHECKGOTO(ncclCeAllGather(comm, args, stream), ret, fail); + break; + case ncclFuncAlltoAll: + NCCLCHECKGOTO(ncclCeAlltoAll(comm, args, stream), ret, fail); + break; + case ncclFuncScatter: + NCCLCHECKGOTO(ncclCeScatter(comm, args, stream), ret, fail); 
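+      // (Each of these CE paths builds an ncclCeBatchOpsParams list and funnels into
+      // ncclCeLaunchBatchOps, which batches via cudaMemcpyBatchAsync when available and
+      // otherwise falls back to per-operation cudaMemcpyAsync; for Scatter only the root
+      // enqueues copies, the other ranks just participate in the ncclMemOpSync syncs.)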
+ break; + case ncclFuncGather: + NCCLCHECKGOTO(ncclCeGather(comm, args, stream), ret, fail); + break; + default: + ret = ncclInvalidUsage; + } + +exit: + return ret; +fail: + goto exit; +} diff --git a/src/collectives.cc b/src/collectives.cc index 03122f8a7..ca69c9a78 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -14,10 +14,13 @@ const char* ncclFuncToString(ncclFunc_t fn) { switch (fn) { case ncclFuncAllGather: return "AllGather"; case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncAlltoAll: return "AlltoAll"; case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncGather: return "Gather"; case ncclFuncRecv: return "Recv"; case ncclFuncReduce: return "Reduce"; case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncScatter: return "Scatter"; case ncclFuncSendRecv: return "SendRecv"; case ncclFuncSend: return "Send"; default: return "Invalid"; @@ -88,6 +91,19 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun return ncclEnqueueCheck(&info); } +NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype))); + + struct ncclInfo info = { ncclFuncAlltoAll, "AlltoAll", + sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */ + ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, @@ -121,6 +137,19 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); } +NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root)); + + struct ncclInfo info = { ncclFuncGather, "Gather", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + GATHER_CHUNKSTEPS, GATHER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, @@ -147,6 +176,19 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv return ncclEnqueueCheck(&info); } +NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root)); + + struct ncclInfo info = { ncclFuncScatter, "Scatter", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, diff --git a/src/debug.cc b/src/debug.cc index f034bc7e0..0d6ed8400 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -15,6 +15,7 @@ #include #include #include "param.h" +#include #define NCCL_DEBUG_RESET_TRIGGERED (-2) @@ -28,9 +29,9 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -static uint64_t ncclDebugMask = 0; +uint64_t ncclDebugMask = 0; FILE *ncclDebugFile = stdout; -static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclDebugMutex; static std::chrono::steady_clock::time_point ncclEpoch; static bool ncclWarnSetDebugInfo = false; @@ -269,15 +270,13 @@ static void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { - bool locked = false; // Keeps track of the ncclDebugLock state. int gotLevel = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { - pthread_mutex_lock(&ncclDebugLock); - locked = true; + std::lock_guard lock(ncclDebugMutex); va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); @@ -285,20 +284,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } if (gotLevel >= 0 && (gotLevel < level || (flags & ncclDebugMask) == 0)) { - if (locked) - pthread_mutex_unlock(&ncclDebugLock); return; } - if (!locked) { - pthread_mutex_lock(&ncclDebugLock); - locked = true; - } - // From this point on ncclDebugLock is always locked so we don't need to check "locked" anymore. 
+ std::lock_guard lock(ncclDebugMutex); if (ncclDebugLevel < 0) ncclDebugInit(); if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) { - pthread_mutex_unlock(&ncclDebugLock); return; } @@ -386,17 +378,35 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // necessary since we write bytes instead of the string. buffer[len++] = '\n'; fwrite(buffer, 1, len, ncclDebugFile); - pthread_mutex_unlock(&ncclDebugLock); } -NCCL_API(void, ncclResetDebugInit); -void ncclResetDebugInit() { +// Non-deprecated version for internal use. +extern "C" +__attribute__ ((visibility("default"))) +void ncclResetDebugInitInternal() { // Cleans up from a previous ncclDebugInit() and reruns. // Use this after changing NCCL_DEBUG and related parameters in the environment. - pthread_mutex_lock(&ncclDebugLock); + std::lock_guard lock(ncclDebugMutex); // Let ncclDebugInit() know to complete the reset. __atomic_store_n(&ncclDebugLevel, NCCL_DEBUG_RESET_TRIGGERED, __ATOMIC_RELEASE); - pthread_mutex_unlock(&ncclDebugLock); +} + +// In place of: NCCL_API(void, ncclResetDebugInit); +__attribute__ ((visibility("default"))) +__attribute__ ((alias("ncclResetDebugInit"))) +void pncclResetDebugInit(); +extern "C" +__attribute__ ((visibility("default"))) +__attribute__ ((weak)) +__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) +void ncclResetDebugInit(); + + +void ncclResetDebugInit() { + // This is now deprecated as part of the NCCL API. It will be removed + // from the API in the future. It is still available as an + // exported symbol. + ncclResetDebugInitInternal(); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/dev_runtime.cc b/src/dev_runtime.cc new file mode 100644 index 000000000..54e6e01bf --- /dev/null +++ b/src/dev_runtime.cc @@ -0,0 +1,995 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "dev_runtime.h" +#include "comm.h" +#include "device.h" +#include "transport.h" +#include "group.h" +#include "nccl_device.h" + +NCCL_PARAM(WinStride, "WIN_STRIDE", -1); + +// Complete types from src/include/dev_runtime.h +struct ncclDevrMemory { + int refCount; + struct ncclDevrMemory* next; + CUmemGenericAllocationHandle memHandle; + size_t size; + size_t bigOffset; // offset in big VA space +}; + +struct ncclDevrWindowSorted { + uintptr_t userAddr; + size_t size; + struct ncclDevrWindow* win; +}; + +struct ncclDevrTeam { + struct ncclDevrTeam* next; + struct ncclTeam team; + CUmemGenericAllocationHandle mcHandle; + void* mcBasePtr; + int worldRankList[]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Helpers at the bottom: + +// Find least index such that `arg < sorted[i].key` (least upper bound) +template +static int listFindSortedLub(Key Obj::*key, Obj* sorted, int count, Key arg); + +template +static void listInsert(Obj** list, int* capacity, int* count, int index, Obj val); + +template +static void listRemove(Obj* list, int* count, int index); + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + if (devr->bigSize != 0) return ncclSuccess; + + bool lsaIsLocal = true; + for (int i=0; i < comm->localRanks; i++) { + lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i; + } + devr->lsaSelf = lsaIsLocal ? comm->localRank : 0; + devr->lsaSize = lsaIsLocal ? comm->localRanks : 1; + devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int)); + for (int i=0; i < devr->lsaSize; i++) { + devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf); + } + + CUmemAllocationProp memProp = {}; + memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memProp.requestedHandleTypes = ncclCuMemHandleType; + memProp.location.id = comm->cudaDev; + CUCHECKGOTO(cuMemGetAllocationGranularity(&devr->granularity, &memProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail_lsaRankList); + + devr->bigSize = ncclParamWinStride(); + if (-devr->bigSize <= 1) { + devr->bigSize = 1; + for (int r=0; r < comm->nRanks; ++r) { + devr->bigSize = std::max(devr->bigSize, comm->peerInfo[r].totalGlobalMem); + } + } + devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32); + INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30); + + ncclSpaceConstruct(&devr->bigSpace); + ncclShadowPoolConstruct(&devr->shadows); + return ncclSuccess; + +fail_lsaRankList: + free(devr->lsaRankList); + return ret; +} + +static void symTeamDestroyAll(struct ncclComm* comm); // Further down + +ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { + struct ncclDevrState* devr = &comm->devrState; + if (devr->bigSize == 0) return ncclSuccess; + + while (!ncclIntruQueueEmpty(&devr->regTaskQueue)) { + struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue); + free(task); + } + + symTeamDestroyAll(comm); + { // delete windowTable + cudaStream_t stream; + if (cudaSuccess == cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)) { + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + while (tableDev != nullptr) { + struct ncclDevCommWindowTable* tableHost; + if (ncclSuccess != 
ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)) break; + struct ncclDevCommWindowTable* next = tableHost->next; + ncclShadowPoolFree(&devr->shadows, tableDev, stream); + tableDev = next; + } + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + } + } + CUdeviceptr flatAddr = reinterpret_cast(devr->lsaFlatBase); + CUCHECKIGNORE(cuMemUnmap(flatAddr, devr->lsaSize*devr->bigSize)); + CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize)); + ncclShadowPoolDestruct(&devr->shadows); + ncclSpaceDestruct(&devr->bigSpace); + free(devr->lsaRankList); + free(devr->winSorted); + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// + +static ncclResult_t symMemoryMapLsaTeam( + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, size_t bigOffset + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + CUmemAccessDesc accessDesc = {}; + union Message { + CUmemGenericAllocationHandle memHandle; + CUmemFabricHandle fabricHandle; + }; + + Message* messages = (Message*)calloc(devr->lsaSize, sizeof(Message)); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + messages[devr->lsaSelf].memHandle = memHandle; + } else { + CUCHECKGOTO(cuMemExportToShareableHandle(&messages[devr->lsaSelf].fabricHandle, memHandle, ncclCuMemHandleType, 0), ret, fail); + } + + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, devr->lsaRankList, devr->lsaSelf, devr->lsaSize, messages, sizeof(Message)), ret, fail); + + if (devr->lsaFlatBase == nullptr) { // Create on first need. + CUdeviceptr addr; + CUCHECKGOTO(cuMemAddressReserve(&addr, devr->lsaSize*devr->bigSize, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); + devr->lsaFlatBase = reinterpret_cast(addr); + } + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + for (int r = 0; r < devr->lsaSize; r++) { + CUmemGenericAllocationHandle impHandle; + if (r == devr->lsaSelf) { + impHandle = memHandle; + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int fd = -1; + NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, devr->lsaRankList[r], &messages[r], &fd), ret, fail); + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, reinterpret_cast((uintptr_t)fd), ncclCuMemHandleType), ret, fail); + SYSCHECKGOTO(close(fd), "close", ret, fail); + } else { + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&messages[r].fabricHandle, ncclCuMemHandleType), ret, fail); + } + } + CUdeviceptr addr = reinterpret_cast((char*)devr->lsaFlatBase + r*devr->bigSize + bigOffset); + CUCHECKGOTO(cuMemMap(addr, size, 0, impHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(addr, size, &accessDesc, 1), ret, fail); + if (r != devr->lsaSelf) { + CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); + } + } + // Ensure everyone has imported my mem handle. 
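+  // After the loop above, LSA rank r's copy of this allocation is mapped at
+  // lsaFlatBase + r*bigSize + bigOffset, i.e. peers sit a fixed bigSize stride apart
+  // in the flat VA space (e.g. with bigSize = 4GiB, rank 2's copy of an allocation at
+  // bigOffset 0x1000 is at lsaFlatBase + 2*(4<<30) + 0x1000). ncclDevrGetLsaRankPtr
+  // relies on exactly this layout.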
+ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, devr->lsaRankList, devr->lsaSelf, devr->lsaSize, 0xbeef), ret, fail); +leave: + free(messages); + return ret; +fail: + goto leave; +} + +static ncclResult_t symBindTeamMemory( + struct ncclComm* comm, struct ncclDevrTeam* tm, struct ncclDevrMemory* mem + ) { + if (comm->nvlsSupport && tm->mcBasePtr != nullptr) { + #if CUDART_VERSION >= 12010 + INFO(NCCL_NVLS, "Binding multicast memory at big=%lx to team {%d x %d}", mem->bigOffset, tm->team.nRanks, tm->team.stride); + CUCHECK(cuMulticastBindMem(tm->mcHandle, mem->bigOffset, mem->memHandle, 0, mem->size, 0)); + #endif + } + return ncclSuccess; +} + +static ncclResult_t symUnbindTeamMemory( + struct ncclComm* comm, struct ncclDevrTeam* tm, struct ncclDevrMemory* mem + ) { + if (comm->nvlsSupport && tm->mcBasePtr != nullptr) { + #if CUDART_VERSION >= 12010 + CUCHECK(cuMulticastUnbind(tm->mcHandle, comm->cudaDev, mem->bigOffset, mem->size)); + #endif + } + return ncclSuccess; +} + +// Caller must barrier the team afterward. +static ncclResult_t symTeamObtain( + struct ncclComm* comm, struct ncclTeam team, bool multimem, + struct ncclDevrTeam** outTeam + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrTeam* t = devr->teamHead; + bool teamIsNew = false; + while (true) { + if (t == nullptr) { + teamIsNew = true; + t = (struct ncclDevrTeam*)malloc(sizeof(struct ncclDevrTeam) + team.nRanks*sizeof(int)); + t->team = team; + t->mcHandle = 0x0; + t->mcBasePtr = nullptr; + for (int i=0; i < team.nRanks; i++) { + t->worldRankList[i] = comm->rank + (i - team.rank)*team.stride; + } + break; + } else if (t->team.rank == team.rank && t->team.nRanks == team.nRanks && t->team.stride == team.stride) { + if (!multimem || t->mcBasePtr != nullptr) { + // Matching team is sufficient + if (outTeam) *outTeam = t; + return ncclSuccess; + } + break; // Need to enable multimem + } + } + + if (multimem) { + if (!comm->nvlsSupport) { + WARN("Multicast support requested for team but none available on system."); + ret = ncclInvalidArgument; + goto fail; + } else { + #if CUDART_VERSION >= 12010 + CUmemGenericAllocationHandle mcHandle = 0; + CUdeviceptr mcAddr = 0; + CUmulticastObjectProp mcProp = {}; + char shareableHandle[NVLS_HANDLE_SIZE]; + + mcProp.numDevices = team.nRanks; + mcProp.handleTypes = ncclCuMemHandleType; + mcProp.flags = 0; + mcProp.size = devr->bigSize; + if (team.rank == 0) { + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcProp, team.rank, team.nRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, t->worldRankList, team.rank, team.nRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail_mcHandle); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, t->worldRankList, team.rank, team.nRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, t->worldRankList[0], &mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail_mcHandle); + CUCHECKGOTO(cuMemAddressReserve(&mcAddr, devr->bigSize, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail_mcHandle); + CUCHECKGOTO(cuMemMap(mcAddr, devr->bigSize, 0, mcHandle, 0), ret, fail_mcHandle_mcAddr); + { CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECKGOTO(cuMemSetAccess(mcAddr, devr->bigSize, 
&accessDesc, 1), ret, fail_mcHandle_mcAddr_unmap); + } + t->mcHandle = mcHandle; + t->mcBasePtr = reinterpret_cast(mcAddr); + + // Bind new team with all existing memories. + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mcHandle_mcAddr_unmap_mems); + } + + if (false) { // Error labels: + fail_mcHandle_mcAddr_unmap_mems: + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + symUnbindTeamMemory(comm, t, mem); + } + fail_mcHandle_mcAddr_unmap: + CUCHECKIGNORE(cuMemUnmap(mcAddr, devr->bigSize)); + goto fail_mcHandle_mcAddr; // silence unused label warning + fail_mcHandle_mcAddr: + CUCHECKIGNORE(cuMemAddressFree(mcAddr, devr->bigSize)); + goto fail_mcHandle; // silence unused label warning + fail_mcHandle: + CUCHECKIGNORE(cuMemRelease(mcHandle)); + goto fail; // silence unused label warning + } + #else + goto fail; // silence unused label warning + #endif + } + } + + if (teamIsNew) { + // Add to list + t->next = devr->teamHead; + devr->teamHead = t; + } + if (outTeam) *outTeam = t; + return ret; + +fail: + if (teamIsNew) free(t); + return ret; +} + +static void symTeamDestroyAll(struct ncclComm* comm) { + struct ncclDevrState* devr = &comm->devrState; + while (devr->teamHead != nullptr) { + struct ncclDevrTeam* t = devr->teamHead; + devr->teamHead = t->next; + if (t->mcBasePtr != nullptr) { + for (struct ncclDevrMemory* m = devr->memHead; m != nullptr; m = m->next) { + symUnbindTeamMemory(comm, t, m); + } + CUdeviceptr mcAddr = reinterpret_cast(t->mcBasePtr); + CUCHECKIGNORE(cuMemUnmap(mcAddr, devr->bigSize)); + CUCHECKIGNORE(cuMemAddressFree(mcAddr, devr->bigSize)); + CUCHECKIGNORE(cuMemRelease(t->mcHandle)); + } + free(t); + } +} + +// On success we take caller's reference on memHandle. +// Due to multicast binds for each pre-exiting team, this function requires +// caller do a world barrier before returning to user. +static ncclResult_t symMemoryObtain( + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, + struct ncclDevrMemory** outMem + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + int64_t bigOffset = 0; + + struct ncclDevrMemory* mem = devr->memHead; + while (mem != nullptr) { + if (mem->memHandle == memHandle) { + CUCHECKIGNORE(cuMemRelease(memHandle)); + goto leave; + } + mem = mem->next; + } + // New memory. + mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory)); + mem->refCount = 0; + mem->memHandle = memHandle; + mem->size = size; + + // Grab offset in the big space. + NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem); + mem->bigOffset = bigOffset; + + // Map unicast addresses into flat VA space for lsa team. + NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space); + + // Bind new memory with each existing team. + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams); + } + // Add to list of mems. 
+ mem->next = devr->memHead; + devr->memHead = mem; + +leave: + mem->refCount += 1; + *outMem = mem; + return ret; + +fail_mem_space_teams: + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + symUnbindTeamMemory(comm, t, mem); + } +fail_mem_space: + ncclSpaceFree(&devr->bigSpace, bigOffset, size); +fail_mem: + free(mem); +//fail: + return ret; +} + +static void symMemoryDropRef( + struct ncclComm* comm, struct ncclDevrMemory* mem + ) { + if (mem != nullptr && 0 == --mem->refCount) { + struct ncclDevrState* devr = &comm->devrState; + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + symUnbindTeamMemory(comm, t, mem); + } + for (int r = 0; r < devr->lsaSize; r++) { + CUdeviceptr addr = reinterpret_cast((char*)devr->lsaFlatBase + r*devr->bigSize + mem->bigOffset); + CUCHECKIGNORE(cuMemUnmap(addr, mem->size)); + } + ncclSpaceFree(&devr->bigSpace, mem->bigOffset, mem->size); + CUCHECKIGNORE(cuMemRelease(mem->memHandle)); + + struct ncclDevrMemory** ptr = &devr->memHead; + while (*ptr != mem) ptr = &(*ptr)->next; + *ptr = mem->next; // Remove from list. + + free(mem); + } +} + +static ncclResult_t symWindowTableInitOnce(struct ncclComm* comm, cudaStream_t stream) { + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + if (tableDev == nullptr) { // Create on first need. + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &tableDev, nullptr, stream)); + devr->windowTable = tableDev; + } + return ncclSuccess; +} + +// On success we take callers reference on `mem`. +static ncclResult_t symWindowCreate( + struct ncclComm* comm, struct ncclDevrMemory* mem, + size_t memOffset, void* userPtr, size_t userSize, int winFlags, void* localReg, + struct ncclWindow_vidmem** outWinDev, struct ncclDevrWindow** outWin, + cudaStream_t stream + ) { + uintptr_t userAddr = reinterpret_cast(userPtr); + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrWindow* win; + + win = (struct ncclDevrWindow*)malloc(sizeof(struct ncclDevrWindow)); + memset(win, 0, sizeof(*win)); + win->memory = mem; + win->size = userSize; + win->bigOffset = mem->bigOffset + memOffset; + win->winFlags = winFlags; + win->localRegHandle = localReg; + if (userPtr == nullptr) { + // Null means caller has no VA and will use the lsa team flat VA address. 
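+    // That is lsaFlatBase + lsaSelf*bigSize + bigOffset: this rank's own slot in the
+    // flat LSA mapping established by symMemoryMapLsaTeam.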
+ win->userPtr = (char*)devr->lsaFlatBase + (devr->lsaSelf*devr->bigSize) + mem->bigOffset; + } else { + win->userPtr = userPtr; + } + + struct ncclWindow_vidmem* winDev; + struct ncclWindow_vidmem* winDevHost; + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &winDev, &winDevHost, stream)); + win->vidmem = winDev; + winDevHost->lsaFlatBase = (char*)devr->lsaFlatBase + win->bigOffset; + winDevHost->mcOffset4K = win->bigOffset>>12; + winDevHost->stride4G = devr->bigSize>>32; + winDevHost->lsaRank = devr->lsaSelf; + winDevHost->worldRank = comm->rank; + winDevHost->winHost = (void*)win; + CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream)); + + NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + struct ncclDevCommWindowTable* tableHost; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); + while (true) { + int i = 0; + while (i < 32 && tableHost->entries[i].window != nullptr) i += 1; + if (i < 32) { + tableHost->entries[i].base = userAddr; + tableHost->entries[i].size = userAddr + userSize; + tableHost->entries[i].window = winDev; + CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream)); + break; + } + if (tableHost->next == nullptr) { + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &tableHost->next, nullptr, stream)); + CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream)); + } + tableDev = tableHost->next; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost)); + } + + { // insert into winSorted[] + int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr); + struct ncclDevrWindowSorted winSort; + winSort.userAddr = userAddr; + winSort.size = userSize; + winSort.win = win; + listInsert(&devr->winSorted, &devr->winSortedCapacity, &devr->winSortedCount, i, winSort); + } + + if (outWinDev) *outWinDev = winDev; + if (outWin) *outWin = win; + return ncclSuccess; +} + +static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vidmem* winDev, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclWindow_vidmem* winDevHost; + struct ncclDevrWindow* winHost; + + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, winDev, &winDevHost), ret, fail); + winHost = (struct ncclDevrWindow*)winDevHost->winHost; + + symMemoryDropRef(comm, winHost->memory); + + { struct ncclDevCommWindowTable* tableDev = devr->windowTable; + struct ncclDevCommWindowTable* tableHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); + while (true) { + int i = 0; + while (i < 32 && tableHost->entries[i].window != winDev) i += 1; + if (i < 32) { + memset(&tableHost->entries[i], 0, sizeof(tableHost->entries[i])); + CUDACHECKGOTO(cudaMemsetAsync(&tableDev->entries[i], 0, sizeof(tableDev->entries[i]), stream), ret, remove_winSorted); + break; + } + if (tableHost->next == nullptr) break; // Error didn't find window in table + tableDev = tableHost->next; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted); + } + } + NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted); + + NCCLCHECKGOTO(ncclCommDeregister(comm, 
winHost->localRegHandle), ret, remove_winSorted); + +remove_winSorted: + { int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, reinterpret_cast(winHost->userPtr)); + i -= 1; // least upper bound is just after ours. + listRemove(devr->winSorted, &devr->winSortedCount, i); + } + free(winHost); +fail: + return ret; +} + +ncclResult_t ncclDevrWindowRegisterInGroup( + struct ncclComm* comm, + void* userPtr, size_t userSize, int winFlags, ncclWindow_t* outWinDev + ) { + ncclResult_t ret = ncclSuccess; + CUdeviceptr memAddr = 0; + size_t memSize = 0; + CUmemGenericAllocationHandle memHandle = 0x0; + size_t memOffset; + struct ncclDevrMemory* mem = nullptr; + cudaStream_t stream = nullptr; + void* localRegHandle = nullptr; + + NCCLCHECKGOTO(ncclCommRegister(comm, userPtr, userSize, &localRegHandle), ret, fail); + + if (!comm->symmetricSupport) { + // We just return the local registration handle directly in this case, as there's no reason to allocate the + // ncclWindow_vidmem structure on the device, etc. + *outWinDev = reinterpret_cast(localRegHandle); + return ncclSuccess; + } + if (winFlags & NCCL_WIN_COLL_SYMMETRIC) { + // Defer symmetric kernel init until at least one window with that flag exists. + NCCLCHECKGOTO(ncclSymkInitOnce(comm), ret, fail); + } + + // Get underlying cumem handle: + CUCHECKGOTO(cuMemGetAddressRange(&memAddr, &memSize, reinterpret_cast(userPtr)), ret, fail_locReg); + memOffset = reinterpret_cast(userPtr) - memAddr; + if (memOffset%NCCL_WIN_REQUIRED_ALIGNMENT != 0) { + WARN("Window address must be suitably aligned."); + ret = ncclInvalidArgument; + goto fail; + } + + CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast(memAddr)), ret, fail_locReg); + + // Trade cumem handle for ncclDevrMemory* + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle); + memHandle = 0x0; // symMemoryObtain took our reference + + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); + + NCCLCHECKGOTO(symWindowCreate( + comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream + ), ret, fail_locReg_memHandle_mem_stream); + mem = nullptr; // symWindowCreate took our reference + + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win); + + // symWindowCreate needs barrier. 
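+  // The peer mappings (and any multicast binds) performed during registration are only
+  // guaranteed complete on every rank once this world barrier has passed, so the window
+  // should not be used before this point.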
+ NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_locReg_memHandle_mem_stream_win); + + cudaStreamDestroy(stream); + return ret; + +fail_locReg_memHandle_mem_stream_win: + symWindowDestroy(comm, *outWinDev, stream); + *outWinDev = nullptr; + cudaStreamSynchronize(stream); +fail_locReg_memHandle_mem_stream: + cudaStreamDestroy(stream); + symMemoryDropRef(comm, mem); +fail_locReg_memHandle: + if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); } +fail_locReg: + ncclCommDeregister(comm, localRegHandle); +fail: + *outWinDev = nullptr; + return ret; +} + +static ncclResult_t deepCopyDevCommRequirements( + struct ncclDevCommRequirements const* src, + struct ncclDevCommRequirements** dst +) { + ncclResult_t ret = ncclSuccess; + struct ncclDevResourceRequirements **dstRes; + struct ncclTeamRequirements **dstTeam; + + NCCLCHECK(ncclCalloc(dst, 1)); + + /* copy the entire struct now and update linked lists later */ + **dst = *src; + + dstRes = &(*dst)->resourceRequirementsList; + for (struct ncclDevResourceRequirements* rr = src->resourceRequirementsList; rr != nullptr; rr = rr->next) { + NCCLCHECKGOTO(ncclCalloc(dstRes, 1), ret, fail); + (*dstRes)->bufferSize = rr->bufferSize; + (*dstRes)->bufferAlign = rr->bufferAlign; + (*dstRes)->outBufferHandle = rr->outBufferHandle; + dstRes = &(*dstRes)->next; + } + + dstTeam = &(*dst)->teamRequirementsList; + for (struct ncclTeamRequirements* tr = src->teamRequirementsList; tr != nullptr; tr = tr->next) { + NCCLCHECKGOTO(ncclCalloc(dstTeam, 1), ret, fail); + (*dstTeam)->team = tr->team; + (*dstTeam)->multimem = tr->multimem; + (*dstTeam)->outMultimemHandle = tr->outMultimemHandle; + dstTeam = &(*dstTeam)->next; + } + +exit: + return ret; +fail: + freeDevCommRequirements(*dst); + *dst = nullptr; + goto exit; +} + +void freeDevCommRequirements( + struct ncclDevCommRequirements* reqs +) { + if (reqs) { + while (reqs->resourceRequirementsList) { + struct ncclDevResourceRequirements* rr_next = reqs->resourceRequirementsList->next; + free(reqs->resourceRequirementsList); + reqs->resourceRequirementsList = rr_next; + } + + while (reqs->teamRequirementsList) { + struct ncclTeamRequirements* tr_next = reqs->teamRequirementsList->next; + free(reqs->teamRequirementsList); + reqs->teamRequirementsList = tr_next; + } + + free(reqs); + } +} + +ncclResult_t ncclDevrCommCreateInternal( + struct ncclComm* comm, + struct ncclDevCommRequirements const* reqs, struct ncclDevComm* outDevComm + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclTeam world = ncclTeamWorld(comm); + struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize); + struct ncclDevrTeam* tmLsa; + size_t bufSizeTotal; + struct ncclDevResourceRequirements* resReqsHead; + struct ncclDevResourceRequirements lsaBarReq; + cudaStream_t stream = nullptr; + CUmemGenericAllocationHandle memHandle = 0x0; + struct ncclDevrMemory* mem = nullptr; + struct ncclDevrWindow* win = nullptr; + struct ncclWindow_vidmem* winHost = nullptr; + + memset(outDevComm, 0, sizeof(*outDevComm)); + outDevComm->rank = comm->rank; + outDevComm->nRanks = comm->nRanks; + outDevComm->nRanks_rcp32 = idivRcp32(comm->nRanks); + outDevComm->lsaRank = devr->lsaSelf; + outDevComm->lsaSize = devr->lsaSize; + outDevComm->lsaSize_rcp32 = idivRcp32(devr->lsaSize); + + NCCLCHECKGOTO(symTeamObtain(comm, lsa, /*multicast=*/reqs->lsaMultimem, &tmLsa), ret, fail); + outDevComm->lsaMultimem.mcBasePtr = tmLsa->mcBasePtr; + + { struct 
ncclTeamRequirements* tr = reqs->teamRequirementsList; + while (tr != nullptr) { + if (tr->multimem) { + struct ncclDevrTeam* tm; + NCCLCHECKGOTO(symTeamObtain(comm, tr->team, tr->multimem, &tm), ret, fail); + if (tr->outMultimemHandle != nullptr) tr->outMultimemHandle->mcBasePtr = tm->mcBasePtr; + } + tr = tr->next; + } + } + + resReqsHead = reqs->resourceRequirementsList; + + ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq); + lsaBarReq.next = resReqsHead; + resReqsHead = &lsaBarReq; + + { struct ncclDevResourceRequirements* rr = resReqsHead; + bufSizeTotal = 0; + while (rr != nullptr) { + bufSizeTotal = alignUp(bufSizeTotal, std::max(128, rr->bufferAlign)); + if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128; + bufSizeTotal += rr->bufferSize; + rr = rr->next; + } + bufSizeTotal = alignUp(bufSizeTotal, devr->granularity); + } + + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); + + NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists + outDevComm->windowTable = comm->devrState.windowTable; + + if (bufSizeTotal == 0) { + outDevComm->resourceWindow = nullptr; + outDevComm->resourceWindow_inlined = {}; + } else { + CUmemAllocationProp memProp = {}; + memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memProp.requestedHandleTypes = ncclCuMemHandleType; + memProp.location.id = comm->cudaDev; + + CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail); + + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail); + memHandle = 0x0; // Reference given to symMemoryObtain + + NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward. 
+ comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0, + /*localReg=*/nullptr, &outDevComm->resourceWindow, &win, + stream), ret, fail); + mem = nullptr; // Reference given to symWindowCreate + NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail); + outDevComm->resourceWindow_inlined = *winHost; + + CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail); + } + + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail); + + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail); + + cudaStreamDestroy(stream); + return ret; + +fail: + if (win != nullptr) { + symWindowDestroy(comm, win->vidmem, stream); + cudaStreamSynchronize(stream); + } + if (mem != nullptr) { + symMemoryDropRef(comm, mem); + } + if (memHandle != 0x0) { + CUCHECKIGNORE(cuMemRelease(memHandle)); + } + if (stream != nullptr) { + cudaStreamDestroy(stream); + } + return ret; +} + +//////////////////////////////////////////////////////////////////////////////// + +NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* ptr, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t ncclCommWindowRegister( + struct ncclComm* comm, void* userPtr, size_t userSize, + struct ncclWindow_vidmem** outWinDev, int winFlags + ) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclDevrRegTask* task; + + CUDACHECK(cudaGetDevice(&saveDev)); + NCCLCHECK(ncclGroupStartInternal()); + + if (userPtr == nullptr || userSize == 0 || !(comm->symmetricSupport || ncclParamLocalRegister())) goto exit; + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + task->userPtr = userPtr; + task->userSize = userSize; + task->winFlags = winFlags; + task->outWinDev = outWinDev; + ncclIntruQueueEnqueue(&comm->devrState.regTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); +ncclResult_t ncclCommWindowDeregister(struct ncclComm* comm, struct ncclWindow_vidmem* winDev) { + ncclResult_t ret = ncclSuccess; + int saveDev; + cudaStream_t stream; + + if (winDev == nullptr) goto exit; + + if (!comm->symmetricSupport) { + NCCLCHECKGOTO(ncclCommDeregister(comm, winDev), ret, fail); + goto exit; + } + CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail_dev); + NCCLCHECKGOTO(symWindowDestroy(comm, winDev, stream), ret, fail_dev_stream); +fail_dev_stream: + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); +fail_dev: + cudaSetDevice(saveDev); +fail: +exit: + return ret; +} + +ncclResult_t ncclDevrFindWindow( + struct ncclComm* comm, void const* userPtr, struct ncclDevrWindow** outWin + ) { + struct ncclDevrState* devr = &comm->devrState; + uintptr_t userAddr = reinterpret_cast(userPtr); + int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr); + if (0 < i && (userAddr - devr->winSorted[i-1].userAddr < devr->winSorted[i-1].size)) { + *outWin = devr->winSorted[i-1].win; + } else { + *outWin = nullptr; + } + return ncclSuccess; 
+} + +NCCL_API(ncclResult_t, ncclDevCommCreate, ncclComm_t comm, ncclDevCommRequirements_t const* reqs, ncclDevComm_t* outDevComm); +ncclResult_t ncclDevCommCreate( + ncclComm_t comm, struct ncclDevCommRequirements const* reqs, + struct ncclDevComm* outDevComm + ) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclDevrCommCreateTask* task = nullptr; + + CUDACHECK(cudaGetDevice(&saveDev)); + NCCLCHECK(ncclGroupStartInternal()); + + if (!comm->symmetricSupport) { + WARN("Communicator does not support symmetric memory!"); + ret = ncclInvalidUsage; + goto fail; + } + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + // reqs must be deep copied to the task so background threads can safely access it + NCCLCHECKGOTO(deepCopyDevCommRequirements(reqs, &task->reqs), ret, fail); + task->outDevComm = outDevComm; + ncclIntruQueueEnqueue(&comm->devrState.commCreateTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + free(task); + goto exit; +} + +NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const* devComm); +ncclResult_t ncclDevCommDestroy( + struct ncclComm* comm, struct ncclDevComm const* devComm + ) { + //struct ncclDevrState* devr = &comm->devrState; + if (devComm->resourceWindow != nullptr) { + NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow)); + } + return ncclSuccess; +} + + +// Get the corresponding pointer in another lsa rank's symmetric memory window +ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, int lsaRank, void** outPtr) { + if (winHost == nullptr || outPtr == nullptr) { + return ncclInvalidArgument; + } + + struct ncclDevrState* devr = &comm->devrState; + + // Validate lsaRank is within bounds + if (lsaRank < 0 || lsaRank >= devr->lsaSize) { + return ncclInvalidArgument; + } + + // Validate offset is within bounds + if (offset < 0 || offset >= winHost->size) { + return ncclInvalidArgument; + } + + // Calculate the address with offset for the specified lsa rank + *outPtr = (void*)((uintptr_t)devr->lsaFlatBase + lsaRank * devr->bigSize + winHost->bigOffset + offset); + return ncclSuccess; +} + +// Get the multicast address for a given team +ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, struct ncclTeam lsaTeam, void** outPtr){ + if (winHost == nullptr || outPtr == nullptr) { + return ncclInvalidArgument; + } + + if (!comm->nvlsSupport) { + return ncclInvalidUsage; + } + + bool multimem = true; + struct ncclDevrTeam* tm; + NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm)); + + // Return the base multicast address for this team with offset + *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset); + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// + +// Find the least index strictly greater than arg. 
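+// For example, with keys {3, 7, 7, 10} and arg = 7 this returns index 3, the first
+// entry whose key is strictly greater than arg; if no such entry exists it returns count.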
+template +static int listFindSortedLub(Key Obj::*key, Obj* sorted, int count, Key arg) { + int lo = 0, hi = count; + while (lo + 16 < hi) { + int i = (lo + hi)/2; + if (sorted[i].*key <= arg) lo = i+1; + else hi = i; + } + int i = lo; + while (i < hi && sorted[i].*key <= arg) i++; + return i; +} + +template +static void listInsert(Obj** list, int* capacity, int* count, int index, Obj val) { + if (*capacity < *count + 1) { + *capacity *= 2; + if (*capacity == 0) *capacity = 16; + *list = (Obj*)realloc(*list, (*capacity)*sizeof(Obj)); + } + for (int j = *count; j != index; j--) { + (*list)[j] = (*list)[j-1]; + } + (*list)[index] = val; + *count += 1; +} + +template +static void listRemove(Obj* list, int* count, int index) { + for (int i = index; i+1 < *count; i++) { + list[i] = list[i+1]; + } + *count -= 1; +} + diff --git a/src/device/CMakeLists.txt b/src/device/CMakeLists.txt new file mode 100644 index 000000000..98447428d --- /dev/null +++ b/src/device/CMakeLists.txt @@ -0,0 +1,60 @@ +# Run the scripts once during configuration to get the file lists +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}" + OUTPUT_VARIABLE files + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +string(STRIP "${files}" files) +list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/) + +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}" + OUTPUT_VARIABLE symmetric_files + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +string(STRIP "${symmetric_files}" symmetric_files) +list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/) + +# Create custom commands to generate source files with proper dependencies +add_custom_command( + OUTPUT ${files} + BYPRODUCTS ${files} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}" + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating device source files" +) + +add_custom_command( + OUTPUT ${symmetric_files} + BYPRODUCTS ${symmetric_files} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}" + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating symmetric device source files" +) + +# Add library target +add_library(nccl_device OBJECT + ${files} + ${symmetric_files} + ${CMAKE_CURRENT_SOURCE_DIR}/common.cu + ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu +) + +set_target_properties(nccl_device PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) + +# Set include directories for the target +target_include_directories(nccl_device PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/src/include + ${CMAKE_SOURCE_DIR}/src/include/plugin + ${CMAKE_BINARY_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + +add_dependencies(nccl_device nccl_header) diff --git a/src/device/Makefile b/src/device/Makefile index 67ab176ca..fd8f2759d 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device MANIFEST := $(OBJDIR)/manifest DEVGLUE_OBJ := $(OBJDIR)/device_glue.o -INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include +INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include -I../include/plugin NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) @@ -47,7 +47,11 @@ endif define COMPILE_SYM @$(SAY) "Compiling" $2;\ mkdir -p $(dir $1);\ - $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1 + if [[ -n "$3" ]]; then\ + $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\ + else\ + touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\ + fi endef DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 diff --git a/src/device/common.h b/src/device/common.h index a2884b50c..a31cf5f8e 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -43,7 +43,7 @@ struct ncclShmemData { struct ncclDevKernelArgs args; int channelId; int aborted; - alignas(16) struct ncclDevComm comm; + alignas(16) struct ncclKernelComm comm; alignas(16) struct ncclDevChannel channel; int batchIx, nextBatchIx; @@ -323,7 +323,7 @@ __device__ __forceinline__ void profiler(int action) { ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; } ncclShmem.channel.workCounter += ncclShmem.nWorks; - if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; + if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } } @@ -351,7 +351,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a /* set abort flag to 0 */ if (tid == 0) { ncclShmem.aborted = 0; - ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; + ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; } // Use first 2 warps to load comm and channel, and remaining load work batch. 
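The two-warp load described in the comment above relies on each thread of a warp moving a single 16-byte vector per pass, which is why the hunk below bounds sizeof(ncclKernelComm) and sizeof(ncclDevChannel) to 16*WARP_SIZE bytes (512 bytes for a 32-thread warp). A minimal sketch of that pattern, using a hypothetical copy16PerThread helper rather than the real copyToShmem16:

__device__ inline void copy16PerThread(int tid, void* dst, const void* src, int bytes) {
  // Illustration only: each thread moves one 16-byte int4, so a 32-thread warp
  // covers at most 16*WARP_SIZE = 512 bytes per call (bytes assumed a multiple of 16).
  int offset = tid * 16;
  if (offset < bytes) {
    reinterpret_cast<int4*>(dst)[tid] = reinterpret_cast<const int4*>(src)[tid];
  }
}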
@@ -359,14 +359,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a case 0: { void* dst = &ncclShmem.comm; void* src = ncclShmem.args.comm; - int bytes = sizeof(ncclDevComm); - static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn."); + int bytes = sizeof(ncclKernelComm); + static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn."); copyToShmem16(tid, dst, src, bytes); } break; case 1: - { // Get address of channel without incurring indirect load from ncclDevComm::channels + { // Get address of channel without incurring indirect load from ncclKernelComm::channels void* dst = &ncclShmem.channel; - void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; + void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; int bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); copyToShmem16(tid-WARP_SIZE, dst, src, bytes); diff --git a/src/device/generate.py b/src/device/generate.py index f9c3a0e79..aefba9422 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os import sys +import shutil # Order of redops, tys, protos, algos must match src/include/device.h all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] @@ -17,8 +18,11 @@ if os.path.exists(gensrc): for name in os.listdir(gensrc): - os.remove(os.path.join(gensrc, name)) - #os.truncate(os.path.join(gensrc, name), 0) + path = os.path.join(gensrc, name) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) else: os.mkdir(gensrc) @@ -322,6 +326,16 @@ def partition_by_name(fns): name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop") name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic") +files = "" +for name in sorted(name_to_funcs.keys()): + files += name + ";" +files += "device_table.cu;" +files += "host_table.cc" + +# Do not print files when running make +if os.environ.get("NCCL_USE_CMAKE", "0") == "1": + print(files) + # Generate /rules.mk with open(os.path.join(gensrc, "rules.mk"), "w") as f: out = f.write diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh index 8f81347ec..9f050836c 100644 --- a/src/device/symmetric/all_gather.cuh +++ b/src/device/symmetric/all_gather.cuh @@ -1,32 +1,33 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ void bcastDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - char* inputHere, char* outputRank0, bool inPlace, int nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + ncclSymPtr input, ncclSymPtr output, bool inPlace, int nIters ) { using Pack = BytePack; int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = 
handler.comm.nRanks; + + Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack tmp[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp[u] = inpHere[u*WARP_SIZE]; + tmp[u] = inpPacks[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -44,21 +45,21 @@ static __device__ void bcastDeep( if (partial && dr == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u]; + outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u]; } if (++r == nRanks) r = 0; } } } - inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp[u] = inpHere[u*WARP_SIZE]; + tmp[u] = inpPacks[u*WARP_SIZE]; } } } @@ -66,18 +67,17 @@ static __device__ void bcastDeep( template static __device__ void bcastEnds( - ncclSymPrims& prim, int tn, int t, - T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, + ncclSymPtr input, ncclSymPtr output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts ) { - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpHere = (BytePack*)inputHere; - BytePack* outRank0 = (BytePack*)outputRank0; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + BytePack* inpPacks = (BytePack*)input.localPtr(); + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i; - BytePack tmp = inpHere[elt]; + BytePack tmp = inpPacks[elt]; int dr = inPlace ? 
1 : 0; int r = rank + dr; if (r == nRanks) r = 0; @@ -85,14 +85,14 @@ static __device__ void bcastEnds( for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) { #pragma unroll UnrollPeers for (int u=0; u < UnrollPeers; u++) { - *add4G(outRank0+elt, r*stride4G) = tmp; + outPacks.lsaPtr(r)[elt] = tmp; if (++r == nRanks) r = 0; } } #pragma unroll UnrollPeers for (int u=0; u < UnrollPeers; u++) { if (dr+u == nRanks) break; - *add4G(outRank0+elt, r*stride4G) = tmp; + outPacks.lsaPtr(r)[elt] = tmp; if (++r == nRanks) r = 0; } } @@ -100,95 +100,99 @@ static __device__ void bcastEnds( template static __device__ void bcast( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + ncclSymPtr input, ncclSymPtr output, size_t nElts ) { bool inPlace = (input == output); - // Mpve to rank=0 - output = prim.peerPtr(0, output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); - uint32_t nPreBytes = (128u - inputUptr)%128u; + uint32_t nPreBytes = (16 - input.offset)%16; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; constexpr int MinWarpPerBlock = 4; - if ((inputUptr-outputUptr)%16 == 0) { + if ((input.offset - output.offset)%16 == 0) { constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; bcastDeep( - prim, tn, t, waitNeeded, - (char*)input + cursor, (char*)output + cursor, inPlace, - chunks*MinWarpPerBlock + handler, tn, t, waitNeeded, bar, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, + inPlace, chunks*MinWarpPerBlock ); cursor = cursorAfter; waitNeeded = false; } } - if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) { constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; bcastDeep<(sizeof(T) <= BytePerPack ? 
BytePerPack : 0), UnrollPacks, UnrollPeers>( - prim, tn, t, waitNeeded, - (char*)input + cursor, (char*)output + cursor, inPlace, - chunks*MinWarpPerBlock + handler, tn, t, waitNeeded, bar, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, + inPlace, chunks*MinWarpPerBlock ); cursor = cursorAfter; waitNeeded = false; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - bcastEnds(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); + bcastEnds(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); } -__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - int const& rank = prim.rank; +__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + int const& rank = handler.comm.rank; - // Threads numbered over rank. - int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int btn = prim.nBlocks*blockDim.x; + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered over rank. + int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int btn = nBlocks*blockDim.x; - bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts); + bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); -} + waitNeeded = false; + } + ); + bar.sync(ncclCoopCta(), cuda::memory_order_release); +} template static __device__ void bcastMultimem( - ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts + ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - // Move output to multimem - output = prim.multimemPtr(output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); - - uint32_t nPreBytes = (16-inputUptr)%16; + uintptr_t inputUptr = reinterpret_cast(input.localPtr()); + uintptr_t outputUptr = reinterpret_cast(output.multimemPtr(handler.comm.lsaMultimem)); + uint32_t nPreBytes = (16 - input.offset)%16; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t nSufBytes; @@ -227,51 +231,52 @@ static __device__ void bcastMultimem( uintptr_t cursor = i < nPreBytes ? 
i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = *reinterpret_cast*>(inputUptr + cursor); multimem_st_global(outputUptr + cursor, val); - cursor += tn*sizeof(T); } } -__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - int const& rank = prim.rank; - - char* input = args->input; - char* output = args->output; - size_t bytes = args->nElts; - // Round robin memory to blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes); +__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar( + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + ); + int const& rank = handler.comm.rank; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin memory to blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_release); } template static __device__ void allgather_LL_body( - ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts + ncclSymkArgsHandler& handler, ncclLLA2ASession& lla2a, + EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts ) { using Pack = BytePack<8>; constexpr int EltPerPack = 8/sizeof(EltType); - - ncclCoopCta cta; - int rank = prim.rank; - int nRanks = prim.nRanks; - constexpr int tn = ncclSymMaxThreads; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; int t = threadIdx.x; + constexpr int tn = ncclSymkMaxThreads; #pragma unroll 1 while (0 < nElts) { int nIterPacks = min(nPacks, tn); if (t < nIterPacks) { Pack x = loadPack(input, t*EltPerPack, nElts); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, x); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, x); } int tn_div_nPacks = tn/nIterPacks; @@ -284,7 +289,7 @@ static __device__ void allgather_LL_body( #pragma unroll 1 for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) { Pack got[Unroll]; - prim.template recvLL(i, Unroll, tn, /*&*/got); + lla2a.template recvUnrolled(i, Unroll, tn, /*&*/got); #pragma unroll for (int u=0; u < Unroll; u++) { storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); @@ -299,7 +304,7 @@ static __device__ void allgather_LL_body( if (i + n*tn < nRanks*nIterPacks) n += 1; if (n != 0) { Pack got[Unroll]; - prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got); + lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got); #pragma unroll for (int u=0; u < Unroll; u++) { if (u != 0 && u == n) break; @@ -313,7 +318,7 @@ static __device__ void allgather_LL_body( // The non-unrolled but "obviously correct" implementation for reference. 
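        // Both the unrolled path above and the reference loop below rely on the same slot
        // layout: each rank publishes its nIterPacks packs at slots
        // [rank*nIterPacks, (rank+1)*nIterPacks), so a flat slot index i decodes as
        // peer = i/nIterPacks and pack = i%nIterPacks; the loops carry peer/pack
        // incrementally (peer += tn/nIterPacks, pack += tn%nIterPacks) to avoid divisions.
        // A rough decoding sketch, with the incremental bookkeeping replaced by explicit
        // division purely for illustration:
        //   for (int i = t; i < nRanks*nIterPacks; i += tn) {
        //     int peer = i / nIterPacks, pack = i % nIterPacks; // producing rank, and which of its packs
        //     Pack got = /* receive slot i, as in the loop below */;
        //     storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
        //   }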
#pragma unroll 1 for (int i = t; i < nRanks*nIterPacks; i += tn) { - Pack got = prim.template recvLL(i); + Pack got = lla2a.template recv(i); storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got); peer += tn_div_nPacks; pack += tn_mod_nPacks; @@ -321,7 +326,7 @@ static __device__ void allgather_LL_body( } #endif - prim.endLL(cta); + lla2a.endEpoch(ncclCoopCta()); input += tn*EltPerPack; output += tn*EltPerPack; @@ -330,38 +335,41 @@ static __device__ void allgather_LL_body( } } -static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); +static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem + ); + using Pack = BytePack<8>; constexpr int BytePerPack = 8; - int nElts = args->nElts; - int nPacks = divUp(nElts, BytePerPack); - - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); - int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); - int nBlockPacks = blockPackEnd - blockPackBegin; - int nBlockElts = nElts - blockPackBegin*BytePerPack; - nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack); - char* blockInput = args->input + blockPackBegin*BytePerPack; - char* blockOutput = args->output + blockPackBegin*BytePerPack; - - uint32_t lowBits = args->nElts; - lowBits |= (uint32_t)reinterpret_cast(args->input); - lowBits |= (uint32_t)reinterpret_cast(args->output); - if (__builtin_expect(lowBits%8 == 0, true)) { - // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us - allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8); - } else { - allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts); - } + + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr input, ncclSymPtr output) { + int nPacks = divUp(nElts, BytePerPack); + + char* blockInput = input.localPtr(); + char* blockOutput = output.localPtr(); + + uint32_t lowBits = nElts; + lowBits |= (uintptr_t)blockInput; + lowBits |= (uintptr_t)blockOutput; + if (__builtin_expect(lowBits%8 == 0, true)) { + // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us + allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, + nElts/8, nPacks, nAllElts/8); + } else { + allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts); + } + } + ); } -__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) { - ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false); +__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false); } -__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) { - ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true); +__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true); } diff --git 
a/src/device/symmetric/all_reduce.cuh b/src/device/symmetric/all_reduce.cuh index 6c5219784..94e40babb 100644 --- a/src/device/symmetric/all_reduce.cuh +++ b/src/device/symmetric/all_reduce.cuh @@ -1,35 +1,38 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "nccl_device.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ __forceinline__ void allreduceDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, char* inputRank0, char* outputRank0, int32_t nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, int32_t nIters ) { using Pack = BytePack; using Acc = typename Red::EltType; using AccPack = BytePack; + ncclTeam world = ncclTeamWorld(handler.comm); int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr inpPacks = (ncclSymPtr)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack acc0[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -39,7 +42,7 @@ static __device__ __forceinline__ void allreduceDeep( { Pack tmp1[UnrollPacks]; #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } #pragma unroll for (int u=0; u < UnrollPacks; u++) { @@ -64,7 +67,7 @@ static __device__ __forceinline__ void allreduceDeep( if (partial && ur!=0 && dr+ur == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } if (++r == nRanks) r = 0; } @@ -95,22 +98,22 @@ static __device__ __forceinline__ void allreduceDeep( if (partial && dr == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u]; + outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u]; } if (++r == nRanks) r = 0; } } } - inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. 
#pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } } @@ -118,21 +121,23 @@ static __device__ __forceinline__ void allreduceDeep( template static __device__ __forceinline__ void allreduceEnds( - ncclSymPrims& prim, int tn, int t, Red red, - T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, Red red, + ncclSymPtr input, ncclSymPtr output, + size_t nElts, uint32_t nPreElts, size_t nSufElts ) { using Acc = typename Red::EltType; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpRank0 = (BytePack*)inputRank0; - BytePack* outRank0 = (BytePack*)outputRank0; + ncclTeam world = ncclTeamWorld(handler.comm); + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr> inpPacks = (ncclSymPtr>)input; + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; - BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc0 = inpPacks.peerPtr(world, rank)[elt]; BytePack acc1; BytePack tmp[UnrollPeers]; int dr = 1; @@ -151,7 +156,7 @@ static __device__ __forceinline__ void allreduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && u!=0 && dr+u == nRanks) break; - tmp[u] = *add4G(inpRank0+elt, r*stride4G); + tmp[u] = inpPacks.peerPtr(world, r)[elt]; r += 1; if (r == nRanks) r = 0; } @@ -179,7 +184,7 @@ static __device__ __forceinline__ void allreduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && dr+u == nRanks) break; - *add4G(outRank0+elt, r*stride4G) = acc0; + outPacks.peerPtr(world, r)[elt] = acc0; r += 1; if (r == nRanks) r = 0; } @@ -190,35 +195,33 @@ static __device__ __forceinline__ void allreduceEnds( template static __device__ void allreduce( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - int nRanks = prim.nRanks; - int nBlocks = prim.nBlocks; - // Mpve to rank=0 - input = prim.peerPtr(0, input); - output = prim.peerPtr(0, output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); + int const& nRanks = handler.comm.nRanks; + int const& nRanks_rcp32 = handler.nRanks_rcp32; size_t nBytes = nElts*sizeof(T); + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); + uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32); - uint32_t nPreBytes = (16u - inputUptr)%16u; + uint32_t nPreBytes = (16u - input.offset)%16u; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; constexpr int MinWarpPerBlock = 4; - if ((inputUptr-outputUptr)%16 == 0) { + if ((input.offset - output.offset)%16 == 0) { constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t 
cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; allreduceDeep( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -226,16 +229,17 @@ static __device__ void allreduce( } } - if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) { constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -243,46 +247,51 @@ static __device__ void allreduce( } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - allreduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); + allreduceEnds(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); } - template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - int /*const&*/ rank = prim.rank; - int /*const&*/ nRanks = prim.nRanks; - Red::Type> red(args->redOpArg); - - // Threads numbered globally such that we round robin warps by rank then block. - int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - rank, nRanks, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int gtn = nRanks*prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts); - - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); -} +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); + + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered globally such that we round robin warps by rank then block. 
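      // Given flattenIx(pos, size, more...) == pos + size*flattenIx(more...), the call
      // below expands (innermost index varying fastest) to
      //   gt = lane + WARP_SIZE*(rank + nRanks*(block + nBlocks*warp))
      // with lane = threadIdx.x%WARP_SIZE and warp = threadIdx.x/WARP_SIZE, so the
      // gtn = nRanks*nBlocks*blockDim.x global threads walk ranks first, then blocks,
      // then warps. A minimal sketch of the same mapping with hypothetical fixed sizes
      // (nRanks = 8, nBlocks = 4), for illustration only:
      //   __device__ int globalThread(int lane, int rank, int block, int warp) {
      //     return lane + WARP_SIZE*(rank + 8*(block + 4*warp));
      //   }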
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*nBlocks*blockDim.x; + + allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts); + + waitNeeded = false; + } + ); + + bar.sync(ncclCoopCta(), cuda::memory_order_release); +} template static __device__ void allreduceMultimem( - ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + int tn, int t, Red red, T* input, T* output, size_t nElts ) { - // Mpve to multimem - input = prim.multimemPtr(input); - output = prim.multimemPtr(output); - uintptr_t inputUptr = reinterpret_cast(input); uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); @@ -327,106 +336,132 @@ static __device__ void allreduceMultimem( uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = applyLoadMultimem(red, inputUptr + cursor); multimem_st_global(outputUptr + cursor, val); - cursor += tn*sizeof(T); } } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - Red::Type> red(args->redOpArg); - - // Threads numbered globally such that we round robin warps by rank then block. - int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.rank, prim.nRanks, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int gtn = prim.nRanks*prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + }; + + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + auto const& multimem = handler.comm.lsaMultimem; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*nBlocks*blockDim.x; + + allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_release); } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); - int /*const&*/ rank = prim.rank; - using Acc = typename ncclSymAccumType::Type; - Red red(args->redOpArg); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, + blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem + ); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + using Acc = typename ncclSymkAccumType::Type; + Red red(handler.devWork->redOpArg); using Pack = BytePack<8>; using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; constexpr int EltPerPack = 8/sizeof(T); - int nElts = args->nElts; - int nPacks = divUp(nElts, EltPerPack); - - bool packAligned = 8 <= alignof(T) || ( - args->nElts*sizeof(T) | - (uint32_t)reinterpret_cast(args->input) | - (uint32_t)reinterpret_cast(args->output) - )%8 == 0; - - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); - int begin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); - - nPacks = end - begin; - nElts -= begin*EltPerPack; - nElts = min(nElts, nPacks*EltPerPack); - T* input = (T*)args->input + begin*EltPerPack; - T* output = (T*)args->output + begin*EltPerPack; - - ncclCoopCta cta; - int t = threadIdx.x; - int tn = ncclSymMaxThreads; - - if (__builtin_expect(packAligned, true)) { - #pragma unroll 1 - while (0 < nPacks) { - if (t < nPacks) { - int nIterPacks = min(nPacks, tn); - Pack inp = loadPack((Pack*)input, t, nPacks); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); - Pack out = prim.template recvReduceLL(t, nIterPacks, red); - storePack((Pack*)output, t, nPacks, out); - } - prim.endLL(cta); - input += tn*EltPerPack; - output += tn*EltPerPack; - nPacks -= tn; - } - } else { - #pragma unroll 1 - while (0 < nElts) { - if (t*EltPerPack < nElts) { - int nIterPacks = min(nPacks, tn); - Pack inp = loadPack(input, t*EltPerPack, nElts); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); - Pack out = prim.template recvReduceLL(t, nIterPacks, red); - storePack(output, t*EltPerPack, nElts, out); - } - prim.endLL(cta); + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr inputPtr, ncclSymPtr outputPtr) { + int nPacks = divUp(nElts, EltPerPack); + + T* input = (T*)inputPtr.localPtr(); + T* output = (T*)outputPtr.localPtr(); + + bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0; + + ncclCoopCta cta; + int t = threadIdx.x; + int tn = ncclSymkMaxThreads; + + if (__builtin_expect(packAligned, true)) { + #pragma unroll 1 + while (0 < nPacks) { + if (t < nPacks) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack((Pack*)input, t, nPacks); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp); + AccPack out = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack((Pack*)output, t, nPacks, applyCast(out)); + } + lla2a.endEpoch(cta); - input += tn*EltPerPack; - output += tn*EltPerPack; - nElts -= tn*EltPerPack; - nPacks -= tn; - } - } + input += tn*EltPerPack; + output += tn*EltPerPack; + nPacks -= tn; + } + } else { + #pragma unroll 1 + while (0 < nElts) { + if (t*EltPerPack < nElts) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack(input, t*EltPerPack, nElts); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp); + AccPack out = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack(output, t*EltPerPack, nElts, applyCast(out)); + } + lla2a.endEpoch(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } + } + } + ); } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) { - ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); } + template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) { - ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); +__device__ 
__forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); } diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index 8fcb9a425..8e62bda5b 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os import sys +import shutil ################################################################################ # The first command line argument is the path to the directory to generate and @@ -10,8 +11,11 @@ if os.path.exists(gensrc): for name in os.listdir(gensrc): - os.remove(os.path.join(gensrc, name)) - #os.truncate(os.path.join(gensrc, name), 0) + path = os.path.join(gensrc, name) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) else: os.mkdir(gensrc) @@ -94,7 +98,7 @@ def enumerate_kernels(): yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty) def required_cuda(k): - cudart, arch, specific_sms = 0, 0, None + cudart, arch, specific_sms = 0, 600, None is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, []) if is_nvls: cudart = max(cudart, 12010) @@ -133,13 +137,13 @@ def kernel_gencode(k): def kernel_cname(k): if k.coll in reductions: - return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty) + return paste("_", "ncclSymkDevKernel", k.coll, k.algo, k.red, k.ty) else: - return paste("_", "ncclSymDevKernel", k.coll, k.algo) + return paste("_", "ncclSymkDevKernel", k.coll, k.algo) def kernel_conds(k): cudart, arch, specific_sms = required_cuda(k) - if cudart == 0: return (None, None) + if cudart == 0 and arch == 0: return (None, None) cudart_cond = "CUDART_VERSION >= %d"%cudart if not specific_sms: @@ -152,30 +156,30 @@ def instantiate(k): cudart_cond, arch_cond = kernel_conds(k) if (cudart_cond, arch_cond) == (None, None): form_red_ty = ( - "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" - " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" + " ncclSymkRun_{id}<{red}, {ty}>(&args4K.args);\n" "}}" ) form = ( - "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" - " ncclSymRun_{id}(&args);\n" + "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" + " ncclSymkRun_{id}(&args4K.args);\n" "}}" ) else: form_red_ty = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" " #if {arch_cond}\n" - " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + " ncclSymkRun_{id}<{red}, {ty}>(&args4K.args);\n" " #endif\n" " }}\n" "#endif" ) form = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" " #if {arch_cond}\n" - " ncclSymRun_{id}(&args);\n" + " ncclSymkRun_{id}(&args4K.args);\n" " #endif\n" " }}\n" "#endif" @@ -192,11 +196,11 @@ def instantiate(k): def prototype(k): cudart_cond, arch_cond = kernel_conds(k) if cudart_cond is None: - form = "__global__ void {cname}(ncclSymDevArgs const);" + form = "__global__ void {cname}(ncclSymkDevWorkArgs4K const);" else: form = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs const);\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K const);\n" "#else\n" " constexpr void* 
{cname} = nullptr;\n" "#endif" @@ -223,18 +227,20 @@ def partition(vals, keyfn): if (fname, coll) not in kernels_by_file: kernels_by_file[fname, coll] = [] +files_to_print = "" # Generate each kernel instantiation file for (fname, coll), ks in kernels_by_file.items(): + files_to_print += fname + ";" with open(os.path.join(gensrc, fname), "w") as f: - emitln(f, '#include "symmetric.h"') + emitln(f, '#include "sym_kernels.h"') emitln(f, '#include "symmetric/kernel.cuh"') emitln(f, '#include "symmetric/{coll}.cuh"'.format(coll=coll_to_lower[coll])) for k in ks: emitln(f, instantiate(k)) -# Generate /symmetric_host.cc -with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f: - emitln(f, '#include "symmetric.h"') +# Generate /sym_kernels_host.cc +with open(os.path.join(gensrc, "sym_kernels_host.cc"), "w") as f: + emitln(f, '#include "sym_kernels.h"') emitln(f, '#include "device.h"') emitln(f, '') @@ -242,19 +248,19 @@ def partition(vals, keyfn): emitln(f, prototype(k)) emitln(f, '') - emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels()))) - emitln(f, 'extern void* const ncclSymKernelList[] = {') + emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels()))) + emitln(f, 'extern void* const ncclSymkKernelList[] = {') for k in enumerate_kernels(): emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) emitln(f, 'nullptr};') emitln(f, '') - emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {') + emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {') indents += 1 emitln(f, 'switch (id) {') emitln(f, 'default: return nullptr;') for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items(): - emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':') + emitln(f, 'case ncclSymkKernelId_'+coll+'_'+algo+':') indents += 1 if len(coll_algo_ks) == 1: emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';') @@ -277,9 +283,15 @@ def partition(vals, keyfn): emitln(f, '}') # Generate /rules.mk +files_to_print += "rules.mk;" +files_to_print += "sym_kernels_host.cc;" + +if os.environ.get("NCCL_USE_CMAKE", "0") == "1": + print(files_to_print) + with open(os.path.join(gensrc, "rules.mk"), "w") as f: inst_names = sorted(set(kernel_fname(k) for k in enumerate_kernels())) - names = inst_names + ["symmetric_kernels.cc"] + names = inst_names + ["sym_kernels_host.cc"] f.write("LIB_OBJS_SYM_GEN = $(patsubst %,$(OBJDIR)/genobj/symmetric/%.o,{names})\n" .format(names=" ".join(names))) f.write("\n") diff --git a/src/device/symmetric/kernel.cuh b/src/device/symmetric/kernel.cuh index f631d51d9..bff67e460 100644 --- a/src/device/symmetric/kernel.cuh +++ b/src/device/symmetric/kernel.cuh @@ -1,27 +1,27 @@ #ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_ #define NCCL_DEVICE_SYMMETRIC_KERNEL_H_ -#include "symmetric.h" +#include "sym_kernels.h" template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args); 
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymkDevWorkArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_LL(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_ST(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(struct ncclSymkDevWorkArgs const* args); #endif diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh index 167024400..73305d54c 100644 --- a/src/device/symmetric/primitives.cuh +++ b/src/device/symmetric/primitives.cuh @@ -1,11 +1,11 @@ #ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ #define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ -#include "symmetric.h" +#include "sym_kernels.h" #include "bitops.h" #include "collectives.h" -#include "op128.h" -#include "reduce_kernel.h" +#include "../op128.h" +#include "../reduce_kernel.h" #if __CUDA_ARCH__ >= 700 // __grid_constant__ appears to break cuda-gdb @@ -24,397 +24,124 @@ static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) { return pos + size*flattenIx(more...); } -// Precomputed integer reciprocoals for denominator values 1..64 inclusive. -// Pass these to idivFast64() for fast division on the GPU. 
-static __device__ uint64_t idivRcp64_upto64(int x) { - static constexpr uint64_t table[65] = { - idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), - idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), - idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), - idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), - idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), - idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), - idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), - idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), - idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), - idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), - idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), - idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), - idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), - idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), - idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), - idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), - idivRcp64(0x40) - }; - return table[x]; -} - -static __device__ uint32_t idivRcp32_upto64(int x) { - return idivRcp64_upto64(x)>>32; -} - namespace { -struct ncclCoopCta { - __device__ void sync() { __syncthreads(); } - __device__ int self() { return threadIdx.x; } - __device__ int count() { return blockDim.x; } -}; -struct ncclCoopWarps { - int log2_nWarps; - __device__ void sync() { - asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<kcomm.devComm), + lsaLLA2A(args->kcomm.lsaLLA2A) { + channelWorkRange = args->getWorkRange(); + + devWork = args->getWorks(args->nMaxChannels); + nRanks_rcp32 = comm.nRanks_rcp32; } - __device__ int self() { return threadIdx.x & ((32<= 12030 && __CUDA_ARCH__ >= 900 - cudaGridDependencySynchronize(); - #endif - - if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) { - barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block]; - } - if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2; - } - __device__ ~ncclSymPrims() { - if (threadIdx.x == 0) { - if (flags & ncclSymPrims_UseBarrier) { - ((flags & ncclSymPrims_UseMultimem) ? base->barEpochMc : base->barEpochUc)[block] = barEpoch; - } - if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2; + template + __device__ void getWorkRange(int block, + uint16_t& workLo, size_t& indexLo, uint16_t& workHi, size_t& indexHi) { + constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T); + uint32_t fracLo, fracHi; + + // Where the work begins + workLo = (block==0) ? 0 : channelWorkRange[block-1].workHi; // start where predecessor ends + fracLo = (block==0) ? 0 : channelWorkRange[block-1].fracHi + 1; + // If the predecessor ended on the work boundary, then we step to the beginning of the next work. + // This ensures we never have empty parts. 
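    // The fracLo/fracHi values are 16.16 fixed-point positions within the work: a
    // fraction f in [0, 0x10000] maps to the cell-aligned element index
    //   index(f) = ((f * divUp(nElts, EltPerCell)) >> 16) * EltPerCell
    // as computed below. A worked example with illustrative values (assuming
    // EltPerCell = 128 and nElts = 1000, so divUp(nElts, EltPerCell) = 8 cells):
    //   f = 0x0000  -> (0x0000*8)>>16  = 0 cells -> index 0
    //   f = 0x8000  -> (0x8000*8)>>16  = 4 cells -> index 512
    //   f = 0x10000 -> (0x10000*8)>>16 = 8 cells -> min(1024, nElts) = 1000
    // so channel boundaries always fall on cell multiples, apart from the final
    // clamp to nElts on the high end.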
+ if (fracLo == 0x10000) { + workLo++; + fracLo = 0; } - } + struct ncclSymkDevWork const& dw = devWork[workLo]; + indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; - template - __device__ T* peerPtr(int peer, T* selfPtr) { - return add4G(selfPtr, (peer-rank)*stride4G); + // Where the work ends + workHi = channelWorkRange[block].workHi; + fracHi = channelWorkRange[block].fracHi + 1; + indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); } template - __device__ T* multimemPtr(T* selfPtr) { - return reinterpret_cast(reinterpret_cast(selfPtr) + offsetMc); + __device__ void getWorkRangeFused(int blockIdx, int w, + int& block, int& nBlocks, size_t& indexLo, size_t& indexHi) { + constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T); + struct ncclSymkDevWork const& dw = devWork[w]; + uint32_t fracLo, fracHi; + int lastBlock; + + block = blockIdx - dw.sChannelId; + nBlocks = dw.nChannels; + lastBlock = dw.sChannelId+dw.nChannels-1; + + // Where the work begins + fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF); + indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; + fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000; + indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); } - __device__ void barrierArrive(ncclCoopCta cta, bool release) { - cta.sync(); - #if __CUDA_ARCH__ < 700 - if (release) { - if (cta.self() == 0) __threadfence_system(); - cta.sync(); - } - #endif - if (flags & ncclSymPrims_UseMultimem) { - #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 - if (cta.self() == 0) { - uint32_t* inbox = &multimemPtr(base)->barInboxMc[block]; - if (release) { - asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); - } else { - asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); - } - } - #endif - } else { - int r = cta.self(); - if (r != rank && r < nRanks) { - uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank]; - #if __CUDA_ARCH__ >= 700 - if (release) { - asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - } else { - asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - } - #else - asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - #endif - } - } - } + template + __device__ void forEachWork(Fn const& fn) { + uint16_t workLo, workHi; + size_t indexLo, indexHi; - __device__ void barrierWait(ncclCoopCta cta, bool acquire) { - if (flags & ncclSymPrims_UseMultimem) { - #if __CUDA_ARCH__ >= 900 - if (cta.self() == 0) { - uint32_t* inbox = &base->barInboxMc[block]; - while (true) { - uint32_t got; - if (acquire) { - asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } else { - asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } - if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break; - } - barEpoch += nRanks; - } - #endif - } else { - int r = cta.self(); - if (r != rank && r < nRanks) { - uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r]; - while (true) { - uint32_t got; - #if __CUDA_ARCH__ >= 700 - if (acquire) { - asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } else { - asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } - #else - asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - #endif - if (got-(barEpoch+1) <= uint32_t(-1)>>1) break; - } - } - 
#if __CUDA_ARCH__ < 700 - if (acquire) { - cta.sync(); - if (cta.self() == 0) __threadfence(); - } - #endif - barEpoch += 1; - } - cta.sync(); - } + getWorkRange(blockIdx.x, workLo, indexLo, workHi, indexHi); - __device__ void endLL(ncclCoopCta cta) { - if (__builtin_expect(llEpoch >= -2u, false)) { - cta.sync(); - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch); - int epochSize = ncclSymLLEpochSize(nRanks); - #pragma unroll 4 - for (int i=cta.self(); i*16 < epochSize; i += cta.count()) { - buf[i] = uint4{0, 0, 0, 0}; - } - } - cta.sync(); - llEpoch += (llEpoch == -1u) ? 3 : 1; - } - - template - __device__ void sendLL(int peer, int slot, T val) { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot; - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - } - - template - __device__ void bcastLL(int slot, T val) { - if (flags & ncclSymPrims_UseMultimem) { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot; - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - } else { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot; - int dr = 0; - int r = rank; + size_t currentIndexLo = indexLo; #pragma unroll 1 - for (; dr+8 <= nRanks; dr += 8) { - #pragma unroll - for (int ur=0; ur < 8; ur++) { - uint4* buf = add4G(buf0, r*stride4G); - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - r += 1; - if (r == nRanks) r = 0; - } - } - #pragma unroll - for (int ur=0; ur < 8; ur++, dr++) { - if (dr == nRanks) break; - uint4* buf = add4G(buf0, r*stride4G); - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + for (int w = workLo; w <= workHi; w++) { + struct ncclSymkDevWork const& dw = devWork[w]; + size_t const& nAllElts = dw.nElts; + size_t currentIndexHi; + int block, nBlocks; + if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) { + getWorkRangeFused(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi); + } else { + currentIndexHi = (w < workHi) ? 
nAllElts : indexHi; + block = 0; + nBlocks = 1; } - r += 1; - if (r == nRanks) r = 0; - } - } - } - template - __device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) { - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0; - uint4 tmp[nSlotsMax][divUp(sizeof(T),8)]; - //int spins=0; - while (true) { - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - if (u < nSlotsMin || u < nSlots) { - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T)))); - } - } - } - bool okAll = true; - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - if (u < nSlotsMin || u < nSlots) { - bool ok = tmp[u][v].y == llEpoch && - tmp[u][v].w == llEpoch; - okAll &= ok; - } - } - } - if (__builtin_expect(okAll, true)) break; - //if (spins++ == 10<<20) spins=0; - } - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - if (nSlotsMin <= u && u == nSlots) break; - union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; }; - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - u32[v][0] = tmp[u][v].x; - u32[v][1] = tmp[u][v].z; - } - elts[u] = val; - } - } + fn(block, nBlocks, currentIndexHi - currentIndexLo, nAllElts, + ncclSymPtr(dw.inputWin, dw.inputOff) + currentIndexLo, + ncclSymPtr(dw.outputWin, dw.outputOff) + currentIndexLo); - template - __device__ Pack recvReduceLL(int slot, int stride, Red red) { - using Acc = typename Red::EltType; - using AccPack = BytePack; - AccPack acc; - bool first = true; - int r = 0; - #pragma unroll 1 - for (; r+Unroll <= nRanks; r += Unroll) { - Pack got[Unroll]; - this->template recvLL(slot + r*stride, Unroll, stride, got); - AccPack acc0 = applyCast(got[0]); - acc = first ? acc0 : applyReduce(red, acc, acc0); - first = false; - #pragma unroll - for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast(got[i])); - } - if (r < nRanks) { - Pack got[Unroll]; - this->template recvLL(slot + r*stride, nRanks-r, stride, got); - AccPack acc0 = applyCast(got[0]); - acc = first ? 
acc0 : applyReduce(red, acc, acc0); - #pragma unroll - for (int i=1; i < Unroll-1; i++) { - if (r+i < nRanks) acc = applyReduce(red, acc, applyCast(got[i])); + currentIndexLo = 0; } - } - return applyCast(acc); } - template - __device__ T recvLL(int slot) { - T one[1]; - this->template recvLL<1, 1, T>(slot, 1, 0, one); - return one[0]; - } + template + __device__ void singleWork(Fn const& fn) { + uint16_t w; + size_t indexLo, indexHi; - template - __device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) { - int me = coop.self(); - if (me < nSlots) { - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me; - uint4 got[divUp(sizeof(T), 8)]; - //int spins=0; - #pragma unroll 1 - while (true) { - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T)))); - } - bool ok = true; - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - ok &= got[u].y == llEpoch; - ok &= got[u].w == llEpoch; - } - if (__builtin_expect(ok, true)) break; - //if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); } - } - union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - u32[u][0] = got[u].x; - u32[u][1] = got[u].z; - } - dst[slot0 + me] = val; - } + getWorkRange(blockIdx.x, w, indexLo, w, indexHi); + + struct ncclSymkDevWork const& dw = devWork[w]; + + fn(indexHi - indexLo, dw.nElts, + ncclSymPtr(dw.inputWin, dw.inputOff) + indexLo, + ncclSymPtr(dw.outputWin, dw.outputOff) + indexLo); } }; } template typename Red, typename T, bool nvls> -struct ncclSymAccumType { using Type = T; }; +struct ncclSymkAccumType { using Type = T; }; // Only Red's whose opArg is invariant w.r.t. the datatype can have a different // accumulator type. At the moment this excludes integer min/max, sumpostdiv, // and premulsum. 
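// For such ops the kernels load packs of T, cast them to the wider accumulator,
// reduce in Acc, and cast back to T only when storing (see the applyCast/applyReduce
// lambdas in the LL paths). A scalar-only sketch of that pattern, using half with a
// float accumulator purely as an illustration (cuda_fp16.h types):
//   __device__ __half sumWithFloatAccum(const __half* vals, int n) {
//     float acc = __half2float(vals[0]);           // widen once on load
//     for (int i = 1; i < n; i++) acc += __half2float(vals[i]);
//     return __float2half(acc);                    // narrow once on store
//   }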
-template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #endif #if defined(__CUDA_FP8_TYPES_EXIST__) -template<> struct ncclSymAccumType { using Type = float; }; -template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #endif #endif diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh index 4fd96093e..8f79b3990 100644 --- a/src/device/symmetric/reduce_scatter.cuh +++ b/src/device/symmetric/reduce_scatter.cuh @@ -1,35 +1,36 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ void reduceDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, char* inputRank0, char* outputHere, int32_t nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, int32_t nIters ) { using Pack = BytePack; using Acc = typename Red::EltType; using AccPack = BytePack; + ncclTeam world = ncclTeamWorld(handler.comm); int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + ncclSymPtr inpPacks = (ncclSymPtr)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack acc0[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -39,7 +40,7 @@ static __device__ void reduceDeep( { Pack tmp1[UnrollPacks]; #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } #pragma unroll for (int u=0; u < UnrollPacks; u++) { @@ -65,7 +66,7 @@ static __device__ void reduceDeep( if (partial && ur!=0 && dr+ur == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } r += 1; if (r == nRanks) r = 0; @@ -85,17 +86,17 @@ static __device__ void reduceDeep( for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); #pragma unroll UnrollPacks - for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u]; + for (int u=0; u < UnrollPacks; u++) outPacks.localPtr()[u*WARP_SIZE] = acc0[u]; - inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += 
intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } } @@ -103,20 +104,22 @@ static __device__ void reduceDeep( template static __device__ void reduceEnds( - ncclSymPrims& prim, int tn, int t, Red red, - T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, Red red, + ncclSymPtr input, ncclSymPtr output, + size_t nElts, uint32_t nPreElts, size_t nSufElts ) { using Acc = typename Red::EltType; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpRank0 = (BytePack*)inputRank0; - BytePack* outHere = (BytePack*)outputHere; + ncclTeam world = ncclTeamWorld(handler.comm); + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr> inpPacks = (ncclSymPtr>)input; + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; - BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc0 = inpPacks.peerPtr(world, rank)[elt]; BytePack acc1; BytePack tmp[UnrollPeers]; int dr = 1; @@ -135,7 +138,7 @@ static __device__ void reduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && u!=0 && dr+u == nRanks) break; - tmp[u] = *add4G(inpRank0+elt, r*stride4G); + tmp[u] = inpPacks.peerPtr(world, r)[elt]; r += 1; if (r == nRanks) r = 0; } @@ -152,26 +155,25 @@ static __device__ void reduceEnds( } acc0 = applyCast(acc1); - outHere[elt] = acc0; + outPacks.localPtr()[elt] = acc0; } } template static __device__ void reduce( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - int nRanks = prim.nRanks; - int nBlocks = prim.nBlocks; - // Mpve input to rank=0 - input = prim.peerPtr(0, input); + int const& nRanks = handler.comm.nRanks; + int const& nRanks_rcp32 = handler.nRanks_rcp32; + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); + uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32); - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); - uint32_t alignment = uint32_t(inputUptr - outputUptr); + uint32_t alignment = uint32_t(input.offset - output.offset); size_t nBytes = nElts*sizeof(T); - uint32_t nPreBytes = (16u - inputUptr)%16u; + uint32_t nPreBytes = (16u - input.offset)%16u; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; @@ -181,12 +183,12 @@ static __device__ void reduce( constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; reduceDeep( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, 
t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -198,12 +200,12 @@ static __device__ void reduce( constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; reduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -211,42 +213,47 @@ static __device__ void reduce( } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - reduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); + reduceEnds(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); } - template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - Red::Type> red(args->redOpArg); - - // Round robin warps over blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + Red::Type> red(handler.devWork->redOpArg); + int const& rank = handler.comm.rank; + + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); + + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); + + waitNeeded = false; + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); } - template static __device__ void reduceMultimem( - ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + int tn, int t, Red red, T* input, T* output, size_t nElts ) { - // Mpve input to multimem - input = prim.multimemPtr(input); - uintptr_t inputUptr = reinterpret_cast(input); uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); @@ -291,41 +298,52 @@ static __device__ void reduceMultimem( uintptr_t cursor = i < nPreBytes ? 
i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = applyLoadMultimem(red, inputUptr + cursor); *reinterpret_cast*>(outputUptr + cursor) = val; - cursor += tn*sizeof(T); } } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - Red::Type> red(args->redOpArg); - - // Round robin warps over blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + }; + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + auto const& multimem = handler.comm.lsaMultimem; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); } // T is user type, EltType is the most aligned type template -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( - ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL_body( + ncclSymkArgsHandler& handler, ncclLLA2ASession& lla2a, + Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { using Pack = BytePack<8>; + using Acc = typename Red::EltType; + using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; constexpr int EltPerPack = 8/sizeof(EltType); - int nRanks = prim.nRanks; - int rank = prim.rank; + int const& nRanks = handler.comm.nRanks; + int const& rank = handler.comm.rank; int t = threadIdx.x; - int tn = ncclSymMaxThreads; + constexpr int tn = ncclSymkMaxThreads; ncclCoopCta cta; #pragma unroll 1 @@ -339,17 +357,25 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( #pragma unroll 1 for (int i = t; i < nRanks*nIterPacks; i += tn) { Pack got = loadPack(input + peer*nStrideElts, pack*EltPerPack, nElts); - prim.sendLL(peer, rank*nIterPacks + pack, got); + lla2a.send(peer, rank*nIterPacks + pack, got); peer += tn_div_nPacks; pack += tn_mod_nPacks; if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } } if (t < nIterPacks) { - Pack got = prim.template recvReduceLL(t, nIterPacks, red); - storePack(output, t*EltPerPack, nElts, got); + AccPack got = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack 
b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack(output, t*EltPerPack, nElts, applyCast(got)); } - prim.endLL(cta); + lla2a.endEpoch(cta); input += tn*EltPerPack; output += tn*EltPerPack; @@ -357,31 +383,34 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( nPacks -= tn; } } -template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL); - Red::Type> red(args->redOpArg); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, ncclSymkMaxThreads + ); + Red::Type> red(handler.devWork->redOpArg); using Pack = BytePack<8>; constexpr int EltPerPack = 8/sizeof(T); - int nAllElts = args->nElts; - int nAllPacks = divUp(nAllElts, EltPerPack); - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32); - int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); - int nPacks = blockPackEnd - blockPackBegin; - int nElts = nAllElts - blockPackBegin*EltPerPack; - nElts = min(nElts, nPacks*EltPerPack); - T* input = (T*)args->input + blockPackBegin*EltPerPack; - T* output = (T*)args->output + blockPackBegin*EltPerPack; - - uint32_t lowBits = args->nElts*sizeof(T); - lowBits |= (uint32_t)reinterpret_cast(args->input); - lowBits |= (uint32_t)reinterpret_cast(args->output); - if (__builtin_expect(lowBits%8 == 0, true)) { - ncclSymRun_ReduceScatter_LL_body(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack); - } else { - ncclSymRun_ReduceScatter_LL_body(prim, red, input, output, nElts, nPacks, nAllElts); - } + + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr inputPtr, ncclSymPtr outputPtr) { + int nPacks = divUp(nElts, EltPerPack); + + T* input = (T*)inputPtr.localPtr(); + T* output = (T*)outputPtr.localPtr(); + + uint32_t lowBits = nElts*sizeof(T); + lowBits |= (uintptr_t)input; + lowBits |= (uintptr_t)output; + if (__builtin_expect(lowBits%8 == 0, true)) { + ncclSymkRun_ReduceScatter_LL_body(handler, lla2a, red, (Pack*)input, (Pack*)output, + nPacks, nPacks, divUp(nAllElts, EltPerPack)); + } else { + ncclSymkRun_ReduceScatter_LL_body(handler, lla2a, red, input, output, nElts, nPacks, nAllElts); + } + } + ); } diff --git a/src/enqueue.cc b/src/enqueue.cc index 225a4cffc..00a0ef8da 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -14,6 +14,9 @@ #include "profiler.h" #include "transport.h" #include "register_inline.h" +#include "ce_coll.h" +#include "nvtx.h" +#include "scheduler.h" #include // std::memcpy #include // PRIx64 @@ -30,8 +33,8 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); for (int sym=0; sym <= 1; sym++) { - int kcount = sym==0 ? ncclDevKernelCount : ncclSymKernelCount; - void* const* kptrs = sym==0 ? ncclDevKernelList : ncclSymKernelList; + int kcount = sym==0 ? ncclDevKernelCount : ncclSymkKernelCount; + void* const* kptrs = sym==0 ? 
ncclDevKernelList : ncclSymkKernelList; for (int k=0; k < kcount; k++) { void* fn = kptrs[k]; cudaFuncAttributes attr = {0}; @@ -164,6 +167,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + if (plan->isSymColl) return; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS); // If we can fit everything into the kernel args we do so. @@ -263,7 +267,6 @@ static bool testBudget( ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; - if (planner->isSymColl) return ncclSuccess; struct ncclTaskColl *task; task = ncclIntruQueueHead(&planner->collTaskQueue); while (task != nullptr) { @@ -328,6 +331,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { struct ncclKernelPlanner* planner = &comm->planner; planner->persistent = ncclCudaGraphValid(planner->capturingGraph); + // Tasks from the sorter come out ordered size descending. struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); // Tasks are assembled by (fn,op,ty) size ascending. @@ -336,36 +340,8 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; int fnOpTyCount = 0; - if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) { - void* sendSymPtr; - void* recvSymPtr; - struct ncclReg* sendReg; - struct ncclReg* recvReg; - size_t size = task->count*ncclTypeSize(task->datatype); - NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg)); - NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg)); - bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype); - - if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) { - enum ncclSymKernelId kernel; - int nChannels, nWarps; - float estTimeUs = 1.e18; - NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps)); - - // We should only use symmetric kernel if it beats the asymmetric kernel. But the - // perf model accuracy from asymmetric kernels is too inaccurate and reports too high - // of a bandwidth. For now just always use symmetric if available. - if (kernel != ncclSymKernelId_Count) { - task->sendbuff = sendSymPtr; - task->recvbuff = recvSymPtr; - task->devFuncId = (int)kernel; - task->nMaxChannels = nChannels; - task->nWarps = nWarps; - ncclIntruQueueEnqueue(&planner->collTaskQueue, task); - planner->isSymColl = true; - return ncclSuccess; - } - } + if (comm->symmetricSupport) { + NCCLCHECK(ncclMakeSymmetricTaskList(comm, task, &planner->collSymTaskQueue, &task)); } // Walk the size sorted tasks, binning them by (fn,op,ty). 
@@ -532,7 +508,7 @@ static ncclResult_t scheduleCollTasksToPlan( size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] - comm->nChannels, comm->nvlsChannels}; + comm->nChannels, std::min(comm->nChannels, comm->nvlsChannels)}; constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal do { size_t workBytes = 0; @@ -725,6 +701,7 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->eActivationMask = task->eActivationMask; proxyOp->incWorkCounter = true; + proxyOp->nChannels = nChannels; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. @@ -740,6 +717,8 @@ static ncclResult_t scheduleCollTasksToPlan( plan->kernelFn = ncclDevKernelForFunc[task->devFuncId]; plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[task->devFuncId]; } + // Profiler + plan->groupApiEventHandle = task->groupApiEventHandle; if (comm->rank == 0) { INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}", @@ -792,8 +771,9 @@ static ncclResult_t addP2pToPlan( int nChannelsMin, int nChannelsMax, int p2pRound, int sendRank, void* sendAddr, ssize_t sendBytes, int recvRank, void* recvAddr, ssize_t recvBytes, - struct ncclTaskP2p** p2pTasks + const int planTotalTasks[], struct ncclTaskP2p** p2pTasks ) { + ncclResult_t ret = ncclSuccess; constexpr int connIndex = 1; bool selfSend = (sendRank == comm->rank); // recv: dir=0, send: dir=1 @@ -804,6 +784,8 @@ static ncclResult_t addP2pToPlan( bool proxySameProcess[2] = {true, true}; void** handles[2] = {NULL, NULL}; uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); + struct ncclProxyOp proxyOps[2] = {}; + int nProxyOps = selfSend ? 
0 : 2; if (!selfSend) { for (int part=0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); @@ -857,7 +839,7 @@ static ncclResult_t addP2pToPlan( bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; - NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); + NCCLCHECKGOTO(ncclCalloc(&handles[dir], nChannelsMax), ret, cleanup); for (int part = 0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; @@ -880,7 +862,7 @@ static ncclResult_t addP2pToPlan( void* regAddr = NULL; if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { // We require users registering buffers on both sides - NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue)); + NCCLCHECKGOTO(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue), ret, cleanup); if (regFlag) { if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr; else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr; @@ -905,14 +887,17 @@ static ncclResult_t addP2pToPlan( if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir]; } - struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + struct ncclWorkList* workNode; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeP2p; workNode->size = sizeof(struct ncclDevWorkP2p); ncclIntruQueueEnqueue(&plan->workQueue, workNode); - uint32_t workOffset = plan->workBytes; + uint32_t workOffset; + workOffset = plan->workBytes; plan->workBytes += sizeof(struct ncclDevWorkP2p); - struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1); + struct ncclDevWorkP2p* work; + work = (struct ncclDevWorkP2p*)(workNode+1); work->nP2pChannels = comm->p2pnChannels; work->channelBase = base; work->nSendChannels = nChannels[1]; @@ -933,8 +918,6 @@ static ncclResult_t addP2pToPlan( work->recvBytes = recvBytes==-1 ? 0 : recvBytes; work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh); - struct ncclProxyOp proxyOps[2] = {}; - int nProxyOps = selfSend ? 0 : 2; for (int dir=0; dir < nProxyOps; dir++) { struct ncclProxyOp* op = &proxyOps[dir]; op->root = dir ? sendRank : recvRank; @@ -947,6 +930,7 @@ static ncclResult_t addP2pToPlan( op->chunkSize = chunkSize[dir]; op->reg = netRegistered[dir]; op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; + op->collAPI = p2pTasks[dir] ? p2pTasks[dir]->collAPI : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; @@ -955,6 +939,15 @@ static ncclResult_t addP2pToPlan( } nChannelsMax = std::max(nChannels[0], nChannels[1]); + // Determine how many peers this plan will target concurrently. Make a + // simplifying assumption that each task targets a different peer. + // Each task is striped across 'nChannelsMax' of 'p2pnChannels' channels. + // Each channel runs up to NCCL_MAX_DEV_WORK_P2P_PER_BATCH tasks concurrently. 
+ int maxConcurrent; + int concurrentTasks[2]; + maxConcurrent = comm->p2pnChannels / nChannelsMax * NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + concurrentTasks[0] = std::min(planTotalTasks[0], maxConcurrent); + concurrentTasks[1] = std::min(planTotalTasks[1], maxConcurrent); for (int part=0; part < nChannelsMax; part++) { int incWorkCounter = -1; int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); @@ -1003,13 +996,17 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; - NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); - NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + proxyOps[dir].nChannels = nChannels[dir]; + proxyOps[dir].nPeers = concurrentTasks[dir]; + NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup); + NCCLCHECKGOTO(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup); } } } - - return ncclSuccess; +cleanup: + free(handles[0]); + free(handles[1]); + return ret; } static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { @@ -1041,6 +1038,8 @@ static ncclResult_t scheduleP2pTasksToPlan( // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; + // Save the total count of send/recv tasks in the plan + int planTotalTasks[2] = {comm->planner.nTasksP2pRecv, comm->planner.nTasksP2pSend}; while (comm->planner.nTasksP2p != 0) { for (int round=0; round < nRanks; round++) { int sendRank = comm->p2pSchedule[round].sendRank; @@ -1071,22 +1070,30 @@ static ncclResult_t scheduleP2pTasksToPlan( ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send); ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv); comm->planner.nTasksP2p -= 2; + comm->planner.nTasksP2pSend -= 1; + comm->planner.nTasksP2pRecv -= 1; } else { // Ensure room for worst case of one new batch per channel. 
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { return ncclSuccess; } struct ncclTaskP2p* p2pTasks[2] = { recv, send }; - NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks)); + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, planTotalTasks, p2pTasks)); if (send != nullptr) { ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group + plan->groupApiEventHandle = send->groupApiEventHandle; ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send); comm->planner.nTasksP2p -= 1; + comm->planner.nTasksP2pSend -= 1; } if (recv != nullptr) { ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group + plan->groupApiEventHandle = recv->groupApiEventHandle; ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv); comm->planner.nTasksP2p -= 1; + comm->planner.nTasksP2pRecv -= 1; } } } @@ -1125,7 +1132,7 @@ namespace { } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (plan->isSymColl) return ncclSuccess; + if (plan->isSymColl || plan->isCeColl) return ncclSuccess; size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); @@ -1297,7 +1304,7 @@ static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelP } static void CUDART_CB hostStreamPlanCallback(void *plan_) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; ncclResult_t result = hostStreamPlanTask(plan->comm, plan); if (result != ncclSuccess) { @@ -1318,6 +1325,9 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); } } + if (plan->isSymColl) { + free(plan->kernelSymArgs); + } // Free coll tasks struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct != nullptr) { @@ -1394,7 +1404,9 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { planner->persistent = persistent; int nPlans = 0; - if (planner->nTasksColl + planner->nTasksP2p != 0) { + if (planner->nTasksColl + planner->nTasksP2p != 0 || + !ncclIntruQueueEmpty(&planner->collSymTaskQueue) || + !ncclIntruQueueEmpty(&planner->collCeTaskQueue)) { do { memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); @@ -1406,53 +1418,55 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { plan->workStorageType = persistent ? 
ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; - if (planner->isSymColl) { - plan->workStorageType = ncclDevWorkStorageTypeArgs; - - struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); - plan->isSymColl = true; - plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype); - plan->threadPerBlock = task->nWarps*WARP_SIZE; - plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels); - - plan->kernelArgsSize = sizeof(struct ncclSymDevArgs); - plan->kernelSymArgs = ncclMemoryStackAlloc(&comm->memScoped); - plan->kernelSymArgs->comm = comm->symDevComm; - plan->kernelSymArgs->rootRank = task->root; - plan->kernelSymArgs->redOpArg = task->opDev.scalarArg; - plan->kernelSymArgs->nElts = task->count; - plan->kernelSymArgs->input = (char*)task->sendbuff; - plan->kernelSymArgs->output = (char*)task->recvbuff; - - planner->nTasksColl -= 1; + if (!ncclIntruQueueEmpty(&planner->collCeTaskQueue)) { + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collCeTaskQueue); + plan->isCeColl = true; + plan->ceCollArgs = ncclMemoryStackAlloc(&comm->memScoped); + plan->ceCollArgs->rootRank = task->root; + plan->ceCollArgs->nElts = task->count; + plan->ceCollArgs->eltSize = ncclTypeSize(task->datatype); + plan->ceCollArgs->sendBuff = (uint8_t*)task->sendbuff; + plan->ceCollArgs->recvBuff = (uint8_t*)task->recvbuff; + plan->ceCollArgs->func = task->func; + plan->ceCollArgs->sendWin = task->sendWin; + plan->ceCollArgs->recvWin = task->recvWin; + ncclIntruQueueEnqueue(&planner->planQueue, plan); - INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d", - ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock); + ncclIntruQueueDequeue(&planner->collCeTaskQueue); + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task); nPlans += 1; } else { - struct ncclKernelPlanBudget budget; - budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); - // Non-persistent kernels fill up at most half of our fifo per kernel. - budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; - - // Drain coll tasks first. This is essential since we partition tasks based - // on the work budget and p2p work isn't collective. If we were to drain p2p - // first, the place where we cut the kernel could vary by rank which would - // cause the "shortest channel first" channel picker to have divergent results. - if (planner->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + if (!ncclIntruQueueEmpty(&planner->collSymTaskQueue)) { + NCCLCHECKGOTO(ncclSymmetricTaskScheduler(comm, &planner->collSymTaskQueue, plan), result, failure); } - // And only drain p2p tasks once colls are depleted. - if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + else { + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); + // Non-persistent kernels fill up at most half of our fifo per kernel. + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. 
If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + } } + finishPlan(comm, plan); if (plan->workBytes != 0) { ncclIntruQueueEnqueue(&planner->planQueue, plan); nPlans += 1; } } - } while (planner->nTasksColl + planner->nTasksP2p != 0); + } while (planner->nTasksColl + planner->nTasksP2p != 0 || + !ncclIntruQueueEmpty(&planner->collSymTaskQueue) || + !ncclIntruQueueEmpty(&planner->collCeTaskQueue)); struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue); planner->unlaunchedPlansHead = planHead; @@ -1531,8 +1545,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif -NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); - ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1542,6 +1554,9 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; int smem = ncclShmemDynamicSize(comm->cudaArch); cudaStream_t launchStream = planner->streams->stream; + + NCCLCHECK(ncclProfilerStartKernelLaunchEvent(plan, launchStream)); + void* extra[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, plan->kernelArgs, CU_LAUNCH_PARAM_BUFFER_SIZE, &plan->kernelArgsSize, @@ -1588,25 +1603,24 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } #endif #if CUDART_VERSION >= 12030 - bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; - NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, plan->persistent, driverVersion), ret, do_return); if (implicitOrder == ncclImplicitOrderLaunch) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; launchAttrs[attrs].value.launchCompletionEvent.flags = 0; attrs++; } - if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) { + if (plan->isSymColl && compCap >= 90 && driverVersion >= 12030) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1; attrs++; } #endif #if CUDART_VERSION >= 13000 - if (compCap >= 90 && driverVersion >= 13000) { + if (compCap >= 100 && driverVersion >= 13000) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING; - launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable(); + launchAttrs[attrs].value.nvlinkUtilCentricScheduling = comm->config.nvlinkCentricSched; attrs++; } #endif @@ -1628,6 +1642,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } do_return: + NCCLCHECK(ncclProfilerStopKernelLaunchEvent(plan)); return ret; } @@ -1765,6 +1780,8 @@ static ncclResult_t updateCollCostTable( if ((a == 
NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue; // CollNetDirect is only supported for up to 8 local GPUs if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; + // Disable CollNet Chain for more than 8 local GPUs + if (a == NCCL_ALGO_COLLNET_CHAIN && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* Tree reduceScatter doesn't support scaling yet */ @@ -1844,7 +1861,11 @@ static ncclResult_t topoGetAlgoInfo( } } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { // NVLS should not need more than 16 channels to get peak BW. - nc = comm->nvlsChannels; + if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + nc = std::min(comm->nvlsChannels, comm->nChannels); + } else { + nc = comm->nvlsChannels; + } } else { // Ring/Tree channel tuning while (nBytes < nc * nt * threadThreshold) { @@ -2107,6 +2128,7 @@ static ncclResult_t calcCollChunking( } proxyOp->pattern = pattern; proxyOp->coll = info->func; + proxyOp->collAPI = info->func; proxyOp->root = info->root; proxyOp->isOneRPN = comm->isOneRPN; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives @@ -2170,6 +2192,35 @@ static ncclResult_t calcCollChunking( proxyOp->nbytes = DIVUP(nBytes, nChannels); } + // Set peer count hints used by network plugin + switch (proxyOp->pattern) { + case ncclPatternRing: + case ncclPatternRingTwice: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternPatUp: + case ncclPatternPatDown: + proxyOp->nPeers = 1; + break; + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternNvlsTree: + proxyOp->nPeers = (NCCL_MAX_TREE_ARITY - 1) * 2; + break; + case ncclPatternCollnetChain: + case ncclPatternCollnetDirect: + case ncclPatternNvls: + case ncclPatternProfiler: + // Peer count hints unused + break; + case ncclPatternSend: + case ncclPatternRecv: + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } + *outChunkSize = proxyOp->chunkSize; return ncclSuccess; } @@ -2269,70 +2320,225 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -// Converts `info` to a task and adds it to `comm->planner`. The exception is with -// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and -// thus don't need a task. -static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { +static ncclResult_t ncclPlannerSetCapturingGraph(struct ncclComm* comm, struct ncclInfo* info) { struct ncclKernelPlanner *planner = &comm->planner; - - if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { - int peer = info->root; - ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - bool isSendNotRecv = info->coll == ncclFuncSend; - - // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
- ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); - struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); - p2p->func = info->coll; - p2p->buff = (void*)info->recvbuff; - p2p->count = info->count; - p2p->datatype = info->datatype; - p2p->root = info->root; - p2p->bytes = nBytes; - p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); - ncclIntruQueueEnqueue( - isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, - p2p); - planner->nTasksP2p += 1; - - // Mark channels that need pre-connect - if (comm->rank != peer) { - if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { - // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. - (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; - int round = 0; - while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank - : comm->p2pSchedule[round].recvRank)) { - round += 1; + if (info->stream != planner->streamRecent || planner->streams == nullptr) { + planner->streamRecent = info->stream; + struct ncclCudaStreamList* l = planner->streams; + while (true) { + if (l == nullptr) { // Got to the end, this must be a new stream. + struct ncclCudaGraph graph; + NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)); + if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { + WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); + return ncclInvalidUsage; } - uint8_t base = ncclP2pChannelBaseForRound(comm, round); - for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { - int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); - if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector - // the send/recv connector is shared among split shared comms. We need to set hasSeen to - // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split - // shared comms together. - comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; - comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector - comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; - comm->connectRecv[peer] |= (1UL<capturingGraph = graph; // C++ struct assignment + // Add stream to list + l = ncclMemoryStackAlloc(&comm->memScoped); + l->stream = info->stream; + l->next = planner->streams; + planner->streams = l; + break; + } + if (l->stream == info->stream) + break; // Already seen stream. + l = l->next; + } + } + return ncclSuccess; +} + +static ncclResult_t p2pTaskAppend( + struct ncclComm* comm, + struct ncclInfo* info, + ncclFunc_t coll, + ncclFunc_t collAPI, + void* buff, + size_t count, + ncclDataType_t datatype, + int peer) { + struct ncclKernelPlanner *planner = &comm->planner; + + // Determine peer and basic parameters. + ssize_t nBytes = count*ncclTypeSize(datatype); + bool isSendNotRecv = coll == ncclFuncSend; + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(comm, ncclGroupTaskTypeCollective); + info->coll = coll; + // Set capturing graph. 
Called here so that profiler can emit a group API event with this information + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph); + NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); + + NCCLCHECK(ncclProfilerStartP2pApiEvent(info, isGraphCaptured)); + + struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); + p2p->func = coll; + p2p->collAPI = collAPI; + p2p->buff = buff; + p2p->count = count; + p2p->datatype = datatype; + p2p->root = peer; + p2p->bytes = nBytes; + p2p->eActivationMask = ncclProfilerApiState.eActivationMask; + p2p->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + p2p->p2pApiEventHandle = ncclProfilerApiState.p2pApiEventHandle; + ncclIntruQueueEnqueue( + isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, + p2p); + planner->nTasksP2p += 1; + if (isSendNotRecv) + planner->nTasksP2pSend += 1; + else + planner->nTasksP2pRecv += 1; + + // Mark channels that need pre-connect + if (comm->rank != peer) { + if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. + (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; + int round = 0; + while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank + : comm->p2pSchedule[round].recvRank)) { + round += 1; + } + uint8_t base = ncclP2pChannelBaseForRound(comm, round); + for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); + if (isSendNotRecv) { + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. + comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; + comm->channels[channelId].peers[peer]->send[1].p2pOnly = 1; + comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; + comm->channels[channelId].peers[peer]->recv[1].p2pOnly = 1; + comm->connectRecv[peer] |= (1UL<planner; + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); + // Set capturing graph. 
Called here so that profiler can emit a group API event with this information + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph); + NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); + NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured)); + + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + t->eActivationMask = ncclProfilerApiState.eActivationMask; + t->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + t->collApiEventHandle = ncclProfilerApiState.collApiEventHandle; + + planner->nTasksColl += 1; + ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); + + ncclProfilerStopCollApiEvent(); + return ncclSuccess; +} + +static ncclResult_t ceCollTaskAppend( + struct ncclComm* comm, + struct ncclInfo* info, + struct ncclDevrWindow* sendWin, + struct ncclDevrWindow* recvWin, + struct ncclDevRedOpFull opDev) { + struct ncclKernelPlanner *planner = &comm->planner; + + // Check if CE needs initialization + if (comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* ceTask; + NCCLCHECK(ncclCalloc(&ceTask, 1)); + ceTask->comm = comm; + ncclIntruQueueEnqueue(&comm->ceInitTaskQueue, ceTask); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + } + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); + + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); + t->sendWin = sendWin; + t->recvWin = recvWin; + + ncclIntruQueueEnqueue(&planner->collCeTaskQueue, t); + + return ncclSuccess; +} + +// Converts `info` to a task and adds it to `comm->planner`. The exception is with +// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and +// thus don't need a task. 
+static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { + ncclFunc_t collAPI = info->coll; + + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { + NCCLCHECK(p2pTaskAppend(comm, info, info->coll, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root)); } else { // Empty collectives can be discarded. if (info->count == 0) return ncclSuccess; if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { - if (comm->minCompCap < 90) { + if (comm->minCompCap < 90 && info->coll != ncclFuncAllGather && info->coll != ncclFuncBroadcast && info->coll != ncclFuncAlltoAll && info->coll != ncclFuncScatter && info->coll != ncclFuncGather) { WARN("FP8 reduction support begins with sm90 capable devices."); return ncclInvalidArgument; } @@ -2347,61 +2553,59 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream)); return ncclSuccess; } else { - // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); - struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); - t->func = info->coll; - t->sendbuff = info->sendbuff; - t->recvbuff = info->recvbuff; - t->count = info->count; - t->root = info->root; - t->datatype = info->datatype; - size_t elementSize = ncclTypeSize(t->datatype); - if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { - t->count *= elementSize; - t->datatype = ncclInt8; - elementSize = 1; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; + ncclDevrFindWindow(comm, info->sendbuff, &sendWin); + ncclDevrFindWindow(comm, info->recvbuff, &recvWin); + bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype); + + // Append CE collective task if CE is supported and requested by user + if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) { + NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev)); } - t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); - t->opHost = info->op; - t->opDev = opDev; // C++ struct assignment - t->chunkSteps = info->chunkSteps; - t->sliceSteps = info->sliceSteps; - t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); - - planner->nTasksColl += 1; - ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); - } - } - - if (info->stream != planner->streamRecent || planner->streams == nullptr) { - planner->streamRecent = info->stream; - struct ncclCudaStreamList* l = planner->streams; - while (true) { - if (l == nullptr) { // Got to the end, this must be a new stream. 
- struct ncclCudaGraph graph; - NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)); - if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { - WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); - return ncclInvalidUsage; + // Append kernel-based collective + else { + if (info->coll == ncclFuncAlltoAll) { + for (int r=0; rnRanks; r++) { + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)((char*)info->sendbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r)); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)((char*)info->recvbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r)); + } + } else if (info->coll == ncclFuncGather){ + size_t offset = 0; + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)info->sendbuff, info->count, info->datatype, info->root)); + if (comm->rank == info->root) { + for (int r=0; rnRanks; r++) { + void* buff = (void*)((char*)info->recvbuff + offset); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, buff, info->count, info->datatype, r)); + offset += info->count * ncclTypeSize(info->datatype); + } + } + } else if (info->coll == ncclFuncScatter) { + size_t offset = 0; + if (comm->rank == info->root) { + for (int r = 0; r < comm->nRanks; r++) { + void* buff = (void*)((char*)info->sendbuff + offset); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, buff, info->count, info->datatype, r)); + offset += info->count * ncclTypeSize(info->datatype); + } + } + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root)); + } else { + NCCLCHECK(collTaskAppend(comm, info, opDev)); } - planner->capturingGraph = graph; // C++ struct assignment - // Add stream to list - l = ncclMemoryStackAlloc(&comm->memScoped); - l->stream = info->stream; - l->next = planner->streams; - planner->streams = l; - break; } - if (l->stream == info->stream) - break; // Already seen stream. 
- l = l->next; } } + return ncclSuccess; } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth + // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls + if (ncclProfilerApiState.profilerGroupDepth > 0) { + ncclProfilerApiState.profilerGroupDepth++; + } NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; diff --git a/src/graph/CMakeLists.txt b/src/graph/CMakeLists.txt new file mode 100644 index 000000000..1dec7cbf7 --- /dev/null +++ b/src/graph/CMakeLists.txt @@ -0,0 +1,14 @@ +# Graph sources +set(GRAPH_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/topo.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuning.cc + ${CMAKE_CURRENT_SOURCE_DIR}/xml.cc + ${CMAKE_CURRENT_SOURCE_DIR}/search.cc + ${CMAKE_CURRENT_SOURCE_DIR}/paths.cc + ${CMAKE_CURRENT_SOURCE_DIR}/connect.cc + ${CMAKE_CURRENT_SOURCE_DIR}/rings.cc + ${CMAKE_CURRENT_SOURCE_DIR}/trees.cc +) + +# Add graph sources to parent scope +set(GRAPH_SOURCES ${GRAPH_SOURCES} PARENT_SCOPE) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 152739b0c..c5fe959ae 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -21,6 +21,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; + topoRanks->crossNicRing = graphs[NCCL_ALGO_RING]->crossNic; topoRanks->nvlsHeadNum = 0; for (int c=0; cchannels+c; @@ -232,7 +233,6 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); - channel->collnetChain.depth = comm->nRanks/comm->nNodes; } free(heads); return ncclSuccess; @@ -249,7 +249,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h; } - for (int c=0; cnChannels; c++) { + for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.nHeads = nHeads; for (int h=0; hnvls.up[h] = comm->nRanks+1+h; @@ -301,7 +301,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead } // Set prev/next in all channels (NVLS compute channels work // orthogonally to NVLS search channels). 
- for (int c=0; cnChannels; c++) { + for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.treeUp = treeUp[c%2]; channel->nvls.treeDown[0] = channel->nvls.down; @@ -389,17 +389,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); - // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { - for (int r=0; rnRanks; r++) { - if (comm->rankToNode[r] % 2 == 1) { - // Exchange rings - for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); - exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); - exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); - exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1)); - } + // Alternate rings to avoid crossing rails. + // CrossNic values could be not the same on all nodes as it depends on the number of net devs and the NVLink bandwidth. + // Therefore, it's only done if the rank obtained a solution with crossNic=2. + for (int r = 0; r < comm->nRanks; r++) { + if (allTopoRanks[r]->crossNicRing == 2 && (nChannels % 2) == 0 && (comm->rankToNode[r] % 2) == 1) { + // Exchange rings + for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); + exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); + exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); + exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1)); } } } @@ -459,7 +459,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } - NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + comm->channels[c].collnetChain.depth = comm->nRanks/comm->nNodes; + } + + if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) { + NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); + } } // Use 4 compute channels per search channel to reach peak BW on <8 PPN @@ -490,9 +497,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) { comm->nvlsChannels = parent->nvlsResources->nChannels; } - if (comm->nChannels < comm->nvlsChannels) { - nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); - } NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail); #endif if (shared && comm->nChannels > parent->sharedRes->tpNChannels) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 82c0d9972..86d185bc0 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -375,11 +375,15 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo; nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo; // A zero UUID means we don't have MNNVL fabric info - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; + memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0)); + 
memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if ((uuid0 | uuid1) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", - info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId); + info2->busId, uuid0, uuid1, fabricInfo2->cliqueId); *ret = 1; } return ncclSuccess; @@ -613,7 +617,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } -NCCL_PARAM(PxnC2c, "PXN_C2C", 0); +NCCL_PARAM(PxnC2c, "PXN_C2C", 1); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. @@ -793,7 +797,6 @@ void ncclTopoFree(struct ncclTopoSystem* system) { free(system); } -NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1); static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) { int peer; @@ -815,8 +818,8 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp } } else { // Remote rank, use network - int nNetChannels = ncclParamNChannelsPerNetPeer(); - if (nNetChannels == -1) { + int nNetChannels = comm->config.nChannelsPerNetPeer; + if (nNetChannels == NCCL_CONFIG_UNDEF_INT) { //start from 2 channels per NIC and reduce with scale nNetChannels = 2; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 8fdf54ea4..3a87725f1 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -8,6 +8,7 @@ #include "graph.h" #include "topo.h" #include "comm.h" +#include "nccl.h" #include "nvmlwrap.h" #include "coll_net.h" #include "transport.h" @@ -15,6 +16,7 @@ #include #include "cpuset.h" #include "bootstrap.h" +#include #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) @@ -404,7 +406,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s #define PCI_BRIDGE_DEVICE_CLASS "0x060400" -struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, {"0x080100", /*CX8 data direct*/PCI}, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, @@ -982,8 +984,7 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par return ncclSuccess; } -ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, -struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, ncclNetVDeviceProps_t* vProps, struct ncclXmlNode** physNetNodes) { if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); return ncclInternalError; @@ -997,7 +998,7 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev // Trigger the merge, then get the new device's properties int vDevIndex = 0; - ncclResult_t ret = makeVDevice(&vDevIndex, vProps); + ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps); if (ret != ncclSuccess) { INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); @@ -1015,9 +1016,10 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) { ncclResult_t ret = ncclSuccess; - INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + const char* str = netInfo->forceMerge; + INFO(NCCL_ENV | NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); char* ncStr; NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1)); strcpy(ncStr, str); @@ -1053,7 +1055,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, goto fail; } - ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); + ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes); if (ret == ncclSuccess) { // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) for (int i = 0; i < vProps.ndevs; i++) { @@ -1075,7 +1077,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, goto exit; } -ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) { // Compute the path type between each device int* paths = NULL; ncclResult_t res = ncclSuccess; @@ -1105,7 +1107,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i" // (Don't merge the same device with itself) for (int j = 0; j < nPhysDevs; j++) { - if (paths[i*nPhysDevs + j] <= mergeLevel && + if (paths[i*nPhysDevs + j] <= netInfo->mergeLevel && placedDevs[j] == 0 && j != i) { vProps.devs[vProps.ndevs++] = j; placedDevs[j] = 1; @@ -1119,7 +1121,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD return ncclInternalError; } - ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); + ncclResult_t ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes); // Merging failed. 
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS) @@ -1157,6 +1159,92 @@ struct kvDict nicPathKvList[] = { { NULL, 0 } }; + +ncclResult_t ncclTopoFindLinkWidthRec(ncclXmlNode* node, ncclXmlNode** physNetNodes, int ndevs, int* foundPhysNet, int* linkWidth) { + int myLinkWidth = 0; + if (strcmp(node->name, "pci") == 0) { + NCCLCHECK(xmlGetAttrInt(node, "link_width", &myLinkWidth)); +#ifdef ENABLE_TRACE + const char *busidAttr, *linkAttr; + NCCLCHECK(xmlGetAttrStr(node, "busid", &busidAttr)); + NCCLCHECK(xmlGetAttr(node, "link_width", &linkAttr)); + TRACE(NCCL_GRAPH, "Found link_width (%s)=%d for busid=%s", linkAttr, myLinkWidth, busidAttr); +#endif + } + + *foundPhysNet = 0; + // Detect if a physical child is found. This information will be propagated up the stack. + int devId = 0; + while (devId < ndevs && !(*foundPhysNet)) *foundPhysNet = (node == physNetNodes[devId++]); + + int totalChildLinkWidth = 0; + for (int i = 0; i < node->nSubs; i++) { + ncclXmlNode* child = node->subs[i]; + int found = 0; + int tempLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &found, &tempLinkWidth)); + if (found) { + *foundPhysNet = 1; + totalChildLinkWidth += tempLinkWidth; + } + } + + if (*foundPhysNet == 0) { + // No child NICs were found, do not accrue any detected link_width + *linkWidth = 0; + INFO(NCCL_GRAPH, "Did not find child net device. Returning link_width=%d totalChildLinkWidth=%d", *linkWidth, totalChildLinkWidth); + } else if (totalChildLinkWidth == 0) { + // If A child NIC was found but no link_width was detected among children, assign the link_width to mine (I am the first pci node right above the physNetNode). + *linkWidth = myLinkWidth; + INFO(NCCL_GRAPH, "Found child net device for %s. Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth); + } else { + // Standard recursive accrual of link_width. The link_width is either the bottleneck of this PCI node's width or the sum of its children's width. + *linkWidth = myLinkWidth > 0 ? std::min(myLinkWidth, totalChildLinkWidth) : totalChildLinkWidth; + INFO(NCCL_GRAPH, "Found child net device for %s. 
Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth); + } + + return ncclSuccess; +} + +// DFS over nodes under common parent +// Exclude link widths of non-physNetNodes chains +ncclResult_t ncclTopoFindLinkWidth(ncclXmlNode* parent, ncclXmlNode** physNetNodes, int ndevs, int* linkWidth) { + *linkWidth = 0; + for (int i = 0; i < parent->nSubs; i++) { + ncclXmlNode* child = parent->subs[i]; + int foundPhysNet = 0; + int childLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &foundPhysNet, &childLinkWidth)); + if (foundPhysNet) { + *linkWidth += childLinkWidth; + } + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoWidenLinks(ncclXmlNode** physNetNodes, int ndevs, ncclXmlNode* parent) { + int sumLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidth(parent, physNetNodes, ndevs, &sumLinkWidth)); + for (int i = 0; i < ndevs; i++) { + ncclXmlNode* temp = physNetNodes[i]; + while (temp != parent) { + if (strcmp(temp->name, "pci") == 0) { + NCCLCHECK(xmlSetAttrInt(temp, "link_width", sumLinkWidth)); + TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, temp->name); + } + temp = temp->parent; + } + } + + if (strcmp(parent->name, "pci") == 0) { + NCCLCHECK(xmlSetAttrInt(parent, "link_width", sumLinkWidth)); + TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, parent->name); + } + + return ncclSuccess; +} + ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) { ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC]; ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC]; @@ -1170,54 +1258,50 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper int path = PATH_LOC; NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent)); - if (path == PATH_LOC) { - *parent = NULL; - } else if (parent && strcmp((*parent)->name, "pci") == 0) { - // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist - const char* c; - NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); - if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + if (path == PATH_PHB || path == PATH_PXB || path == PATH_PIX) { + INFO(NCCL_GRAPH, "Widening links"); + NCCLCHECK(ncclTopoWidenLinks(physNetNodes, vProps->ndevs, *parent)); + } + + if (*parent) { + if (strcmp((*parent)->name, "pci") == 0) { + // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist + const char* c; + NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); + if (c && strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } + } else if (strcmp((*parent)->name, "cpu") == 0) { // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); } } + TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); return ncclSuccess; } -ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { +ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int physicalDevs) { int* placedDevs = NULL; struct ncclXmlNode** physNetNodes = NULL; - if 
(physicalDevs == 0) return ncclSuccess; - - ncclCalloc(&physNetNodes, physicalDevs); + ncclNetProperties_t* props = NULL; ncclResult_t res = ncclSuccess; + if (physicalDevs == 0) return ncclSuccess; - ncclNetProperties_t* props = NULL; - ncclCalloc(&props, physicalDevs); + NCCLCHECK(ncclCalloc(&physNetNodes, physicalDevs)); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + NCCLCHECK(ncclCalloc(&props, physicalDevs)); for (int i = 0; i < physicalDevs; i++) { - NCCLCHECKGOTO(getProperties(i, props + i), res, out); + NCCLCHECKGOTO(netInfo->getProperties(i, props + i), res, out); struct ncclXmlNode* physNetNode; NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out); physNetNodes[i] = physNetNode; TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i, props[i].name); } - // By default, don't merge any devices - int mergeLevel; - mergeLevel = PATH_PORT; - { // Avoids warnings related to jumping to "out" - const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); - - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); - } - } - NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + if (netInfo->forceMerge) NCCLCHECKGOTO(ncclTopoForceMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out); + NCCLCHECKGOTO(ncclTopoAutoMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out); out: free(physNetNodes); @@ -1226,10 +1310,10 @@ ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)( return res; } -static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) { +static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, struct ncclTopoNetInfo* netInfo, int virtualNics) { for (int n = startIndex; n < endIndex; n++) { ncclNetProperties_t props; - NCCLCHECK(getProperties(n, &props)); + NCCLCHECK(netInfo->getProperties(n, &props)); struct ncclXmlNode* netNode = NULL; struct ncclXmlNode* parent = NULL; if (virtualNics) { @@ -1237,7 +1321,7 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name)); // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC // Only run this if the net doesn't exist locally - this may alter the XML state - if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent)); + if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, netInfo->getProperties, &props.vProps, &parent)); } NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent)); @@ -1248,18 +1332,18 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); int dev; xmlGetAttrIntDefault(netNode, "dev", &dev, -1); - if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n); + if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET 
: Changing %s dev index from %d to %d", netInfo->name, dev, n); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netInfo->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netInfo->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); // Only set coll if it's not 0 - if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + if (netInfo->coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", netInfo->coll)); const char* keepAttr; NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); @@ -1272,51 +1356,45 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn } // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. -ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) { - int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); - if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); - // Enumerate physical devices - NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport)); +ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net) { + bool usePhysicalDevices = (dumpXmlFile || net->makeVDevice == NULL); + int nPhysicalNics, nVirtualNics; + NCCLCHECK(net->getDevCount(net->netPluginIndex, &nPhysicalNics, &nVirtualNics)); + // List the physical devices in the topo + NCCLCHECK(ncclTopoPopulateNics(xml, 0, nPhysicalNics, net, /*virtual=*/false)); if (!usePhysicalDevices) { - if (state->nVirtualNics == -1) { - NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics)); + // Virtual devices are only created once per network + if (nVirtualNics == NCCL_UNDEF_DEV_COUNT) { + NCCLCHECK(ncclTopoMakeVNics(xml, net, nPhysicalNics)); + // Update the number of virtual devices both locally and in the state tracking the plugin. 
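The getDevCount/setVirtDevCount hooks above cache the virtual-device count per plugin, so NIC fusion runs only once and later communicators reuse the result (zero virtual devices is a valid, cached outcome). A toy sketch of that bookkeeping, with made-up helper names:

// Toy per-plugin device-count cache mirroring the getDevCount/setVirtDevCount idea.
// Plugin indices and the UNDEF sentinel here are illustrative only.
#include <cassert>

constexpr int kMaxPlugins = 4;
constexpr int kUndefDevCount = -1;

struct DevCounts { int physical = kUndefDevCount; int virt = kUndefDevCount; };
static DevCounts g_counts[kMaxPlugins];

// Record how many virtual devices were created for a plugin (0 is valid).
static void setVirtDevCountFor(int plugin, int nVirtual) { g_counts[plugin].virt = nVirtual; }

// Return cached counts; virt stays kUndefDevCount until fusion has run once.
static void getDevCountFor(int plugin, int* nPhysical, int* nVirtual) {
  *nPhysical = g_counts[plugin].physical;
  *nVirtual  = g_counts[plugin].virt;
}

int main() {
  g_counts[0].physical = 8;          // discovered by the plugin's devices() call
  int phys, virt;
  getDevCountFor(0, &phys, &virt);
  if (virt == kUndefDevCount) {      // first communicator: fuse NICs, then record the result
    int totalAfterFusion = 10;       // pretend devices() now reports 10 (8 physical + 2 fused)
    setVirtDevCountFor(0, totalAfterFusion - phys);
  }
  getDevCountFor(0, &phys, &virt);
  assert(virt == 2);
  return 0;
}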
+ // Note: 0 is a valid number of virtual devices int nDevs; - NCCLCHECK(devices(&nDevs)); - state->nVirtualNics = nDevs - state->nPhysicalNics; + NCCLCHECK(net->devices(&nDevs)); + nVirtualNics = nDevs - nPhysicalNics; + NCCLCHECK(net->setVirtDevCount(net->netPluginIndex, nVirtualNics)); } - if (state->nVirtualNics > 0) { - // Populate new devices - NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport)); + // populate the virtual devices if any + if (nVirtualNics > 0) { + NCCLCHECK(ncclTopoPopulateNics(xml, nPhysicalNics, nPhysicalNics + nVirtualNics, net, /*virtual=*/true)); } } return ncclSuccess; } -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) { - INFO(NCCL_GRAPH, "Retrieving state for %s", name); - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) { - // Empty slot - if (states[i].name == NULL) { - states[i].nVirtualNics = -1; - states[i].nPhysicalNics = -1; - states[i].name = strdup(name); - *state = states + i; - INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name); - return ncclSuccess; - // Found my slot - } else if (strcmp(states[i].name, name) == 0) { - *state = states + i; - return ncclSuccess; - } +ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge) { + if (forceMerge) *forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) { + kvConvertToInt(mergeLevelEnv, mergeLevel, nicPathKvList); + } else { + *mergeLevel = PATH_PORT; } - WARN("NET/TOPO : Couldn't find net with name %s", name); - return ncclInternalError; + return ncclSuccess; } +static std::mutex netMutex; + ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) { ncclResult_t ret = ncclSuccess; struct ncclXml* xml; @@ -1324,7 +1402,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy int* localRanks = NULL; struct ncclXml* rankXml; int localRank = -1, nLocalRanks = 0; - int netLockHeld = 0; + struct ncclTopoNetInfo netInfo = {0}; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { @@ -1364,21 +1442,35 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. 
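The hunk below replaces the pthread_mutex_lock/unlock pair (and the netLockHeld cleanup on the failure path) with a scoped std::lock_guard, so the lock is released on every exit path automatically. A minimal sketch of that RAII pattern:

// Sketch of the RAII locking pattern: a scoped std::lock_guard releases the
// mutex on every return path, so explicit "unlock on fail" bookkeeping
// (the old netLockHeld flag) is no longer needed.
#include <mutex>

static std::mutex g_netMutex;

enum Result { Success, Error };

static Result importPlugins(bool failEarly) {
  std::lock_guard<std::mutex> lock(g_netMutex);  // held for the whole block
  if (failEarly) return Error;                   // mutex released here too
  // ... populate topology from plugins ...
  return Success;                                // and here
}

int main() { return importPlugins(false) == Success ? 0 : 1; }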
- pthread_mutex_lock(&netLock); - netLockHeld = 1; - INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); - ncclTopoNetState* state; - state = NULL; - if (collNetSupport(comm)) { - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state, - comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail); - } - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); - pthread_mutex_unlock(&netLock); - netLockHeld = 0; + { + std::lock_guard lock(netMutex); + INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); + if (collNetSupport(comm)) { + netInfo.coll = 1; + netInfo.netPluginIndex = comm->netPluginIndex; + netInfo.dmaBufSupport = comm->dmaBufSupport; + netInfo.getDevCount = ncclCollNetGetDevCount; + netInfo.setVirtDevCount = ncclCollNetSetVirtDevCount; + netInfo.name = comm->ncclCollNet->name; + netInfo.getProperties = comm->ncclCollNet->getProperties; + netInfo.makeVDevice = comm->ncclCollNet->makeVDevice; + netInfo.devices = comm->ncclCollNet->devices; + NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge)); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail); + } + + netInfo.coll = 0; + netInfo.netPluginIndex = comm->netPluginIndex; + netInfo.dmaBufSupport = comm->dmaBufSupport; + netInfo.getDevCount = ncclNetGetDevCount; + netInfo.setVirtDevCount = ncclNetSetVirtDevCount; + netInfo.name = comm->ncclNet->name; + netInfo.getProperties = comm->ncclNet->getProperties; + netInfo.makeVDevice = comm->ncclNet->makeVDevice; + netInfo.devices = comm->ncclNet->devices; + NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge)); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail); + } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail); @@ -1436,7 +1528,6 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy free(xml); return ret; fail: - if (netLockHeld) pthread_mutex_unlock(&netLock); goto exit; } @@ -1491,6 +1582,38 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } +enum netDevsPolicy { + NETDEVS_POLICY_AUTO = 0x0, + NETDEVS_POLICY_ALL = 0x1, + NETDEVS_POLICY_MAX = 0x2, + NETDEVS_POLICY_UNDEF = 0xffffffff +}; + +static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; +static int netDevsPolicyNum = -1; + +static void getNetDevsPolicyOnce() { + const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY"); + if (envStr) { + if (strcasecmp(envStr, "AUTO") == 0) { + netDevsPolicy = NETDEVS_POLICY_AUTO; + } else if (strcasecmp(envStr, "ALL") == 0) { + netDevsPolicy = NETDEVS_POLICY_ALL; + } else if (strncasecmp(envStr, "MAX:", strlen("MAX:")) == 0) { + int envNum = atoi(envStr + strlen("MAX:")); + if (envNum > 0) { + netDevsPolicy = NETDEVS_POLICY_MAX; + netDevsPolicyNum = envNum; + } + } + if (netDevsPolicy == NETDEVS_POLICY_UNDEF) + INFO(NCCL_ENV, "Unable to recognize NCCL_NETDEVS_POLICY=%s, using NCCL_NETDEVS_POLICY_AUTO instead.", envStr); + else + INFO(NCCL_ENV, "NCCL_NETDEVS_POLICY 
set by environment to %s", envStr); + } + if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO; +} + ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); @@ -1503,13 +1626,30 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclInternalError; } - int localGpus[NCCL_TOPO_MAX_NODES]; - int localGpuCount; - NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once,getNetDevsPolicyOnce); + int netsPerGpu = 0; + if (netDevsPolicy == NETDEVS_POLICY_AUTO) { + int localGpus[NCCL_TOPO_MAX_NODES]; + int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + netsPerGpu = DIVUP(localNetCount, localGpuCount); + } else if (netDevsPolicy == NETDEVS_POLICY_ALL) { + netsPerGpu = localNetCount; + } else if (netDevsPolicy == NETDEVS_POLICY_MAX) { + if (netDevsPolicyNum <= 0) { + WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); + return ncclInternalError; + } + netsPerGpu = std::min(netDevsPolicyNum, localNetCount); + } else { + WARN("Unknown netDevs policy"); + return ncclInternalError; + } int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); - net += channelId%(DIVUP(localNetCount,localGpuCount)); + net += channelId%(netsPerGpu); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; return ncclSuccess; @@ -1567,25 +1707,10 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr))); - } -#endif - // Get the affinity of the CPU close to our GPU. cpu_set_t cpuMask = cpu->cpu.affinity; -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr))); - } -#endif - + // Get the final affinity cpu_set_t finalMask; if (ncclParamIgnoreCpuAffinity()) // Ignore the CPU affinity set and use the GPU one instead @@ -1596,12 +1721,22 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu memcpy(affinity, &finalMask, sizeof(cpu_set_t)); - // If there is a non empty set, use it to set affinity + // display the final affinity + char msg[1024] = ""; + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "Affinity for GPU %d is ", gpu->gpu.dev); if (CPU_COUNT(&finalMask)) { - char affinityStr[sizeof(cpu_set_t)*2]; - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr))); + (void)ncclCpusetToRangeStr(&finalMask, msg + strlen(msg), sizeof(msg) - strlen(msg)); + } else { + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "empty, ignoring"); + } + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ". 
(GPU affinity = "); + (void)ncclCpusetToRangeStr(&cpuMask, msg + strlen(msg), sizeof(msg) - strlen(msg)); + if (!ncclParamIgnoreCpuAffinity()) { + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " ; CPU affinity = "); + (void)ncclCpusetToRangeStr(&mask, msg + strlen(msg), sizeof(msg) - strlen(msg)); } + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ")."); + INFO(NCCL_INIT, "%s: %s", __func__, msg); return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 9ef10ff2d..49d408d95 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -190,12 +190,26 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); -struct ncclTopoNetState { - int nVirtualNics; - int nPhysicalNics; +struct ncclTopoNetInfo { + bool coll; + // communicator-specific information + int netPluginIndex; + bool dmaBufSupport; + // NIC fusion + int mergeLevel; + const char* forceMerge; + // dev count tracking functions (not part of ncclNet) + ncclResult_t (*getDevCount)(int, int*, int*); + ncclResult_t (*setVirtDevCount)(int, int); + // ncclNet API functions const char* name; + ncclResult_t (*getProperties)(int, ncclNetProperties_t*); + ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*); + ncclResult_t (*devices)(int*); }; -ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport); + +ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net); +ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 @@ -240,6 +254,8 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in return ncclInternalError; } +extern struct kvDict nicPathKvList[]; + static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) { *netDev = -1; for (int i=0; inodes[NET].count; i++) { diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8e99f18c3..bfb279850 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -8,6 +8,7 @@ #include "device.h" #include "comm.h" #include "topo.h" +#include "nccl_tuner.h" NCCL_PARAM(Nthreads, "NTHREADS", -2); NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); @@ -129,63 +130,72 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes goto exit; } -// Latencies in us, Bandwidths in GB/s -// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { - { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring - { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain - { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree - -// NVLink, PCI, Network -#define NCCL_HW_NVLINK 0 -#define NCCL_HW_PCI 1 -#define NCCL_HW_NET 2 -static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = -{ /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, - /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, - /* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } }, - /* PCI */ - { /* Tree 
(LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, - /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, - /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, - /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, - /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 }, - /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } } -}; - -/* Array indexes used below */ -#define VOLTA_COMPCAP_IDX 0 -#define AMPERE_COMPCAP_IDX 1 -#define HOPPER_COMPCAP_IDX 2 -#define BLACKWELL_COMPCAP_IDX 3 - -// LL128 max BW per channel -static const double llMaxBws[][3] = { - /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, - /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, - /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, +// NVLS efficiency factor. +static const float nvlsEfficiency[NCCL_NUM_COMPCAPS] = { + 0.0f, // Volta + 0.0f, // Ampere + 0.85f, // Hopper + 0.74f, // Blackwell }; -static const double perChMaxRingLL128Bws[][3] = { - /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, - /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, -}; -static const double perChMaxTreeLL128Bws[][3] = { - /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, - /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, -}; -static const double perChMaxTreeBws[][3] = { - /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, - /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, - /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, - /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, +// Default tuner constants +static const ncclTunerConstants_t ncclTunerConstantsDefaults = { + .baseLatencies = { + { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring + { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain + { 0, 0, 0 }, { 0, 0, 0 }, // NVLS, NVLS Tree + { 8.0, 8.0, 8.0 } // PAT + }, + .hwLatencies = { + /* NVLINK */ + { { .6, 1.25, 4.0 }, { .6, 1.9, 3.4 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 25 }, { 0, 0, 25 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/ + }, + /* PCI */ + { { 1.0, 1.9, 4.0 }, { 1.0, 2.5, 5.7 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 0 }, { 0, 0, 0 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/ + }, + /* NET */ + { { 5.0, 8.5, 14 }, { 2.7, 4.0, 14.0 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 31 }, { 0, 0, 30 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 18 }, { 0, 0, 14 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 14 } /* PAT (LL/LL128/Simple)*/ + }, + }, + .llMaxBws = { + {39.0, 39.0, 20.4}, /* Volta-N1/Intel-N2/Intel-N4) */ + {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1/AMD-N2/AMD-N4) */ + {141.0, 45.0 /*avg of ring & tree*/, 35.0}, /* Hopper-N1/AMD-N2/AMD-N4) */ + {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, /* 
Blackwell-N1/AMD-N2/AMD-N4) */ + }, + .perChMaxRingLL128Bws = { + {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ + {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ + {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */ + {2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxTreeLL128Bws = { + {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ + {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ + {36.7, 36.7, 29.0}, /* Hopper (N1/N2/N4) */ + {55.6, 31.67, 20.0}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxTreeBws = { + {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ + {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ + {38.7, 41.4, 36.0}, /* Hopper (N1/N2/N4) */ + {70.0, 42.8, 24.0}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxNVLSTreeBws = { + {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ + {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ + {0.0, 57.7, 45.5}, /* Hopper (N1/N2/N4) */ + {0.0, 96.0, 43.1} /* Blackwell (N1/N2/N4) */ + } }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -210,6 +220,13 @@ static float getNetOverhead(struct ncclComm* comm) { NCCL_PARAM(Ll128C2c, "LL128_C2C", 1); +ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) { + + comm->tunerConstants = ncclTunerConstantsDefaults; + + return ncclSuccess; +} + ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = @@ -229,17 +246,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); + int compCapIndex = minCompCap >= 100 ? NCCL_BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? NCCL_HOPPER_COMPCAP_IDX : minCompCap >= 80 ? NCCL_AMPERE_COMPCAP_IDX : NCCL_VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 
1 : 0; - double llMaxBw = llMaxBws[index1][index2]; - double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; - double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; - double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; + double llMaxBw = comm->tunerConstants.llMaxBws[index1][index2]; + double perChMaxTreeBw = comm->tunerConstants.perChMaxTreeBws[compCapIndex][index2]; + double perChMaxRingLL128Bw = comm->tunerConstants.perChMaxRingLL128Bws[compCapIndex][index2]; + double perChMaxTreeLL128Bw = comm->tunerConstants.perChMaxTreeLL128Bws[compCapIndex][index2]; + double perChMaxNVLSTreeBw = comm->tunerConstants.perChMaxNVLSTreeBws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring - if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; @@ -264,15 +282,22 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; - if (a == NCCL_ALGO_NVLS) { + if (a == NCCL_ALGO_NVLS_TREE || a == NCCL_ALGO_NVLS) + { + // NVLS/NVLStree needs at least 2 channels + if (graphs[a]->nChannels < 2 ) continue; + // Convert to NVLS busBW/channel + float intraBw = graphs[a]->bwIntra * nvlsEfficiency[compCapIndex] * (graphs[a]->nChannels - 1) / graphs[a]->nChannels; + // AllReduce pipelines two operations. if (coll == ncclFuncAllReduce) { - bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + intraBw *= 2.0f; } else { - // allgather and reducescatter - bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f); + intraBw *= (ppn - 1) / ppn; } - } - if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); + // Handle 2 node case of NVLSTree + float interBw = graphs[a]->bwInter * ((nNodes <= 2 && a == NCCL_ALGO_NVLS_TREE) ? 2 : 1); + bw = std::min( {intraBw, interBw, a == NCCL_ALGO_NVLS_TREE ? 
(float)perChMaxNVLSTreeBw : std::numeric_limits::max()} ); + }; float busBw = graphs[a]->nChannels * bw; // Various model refinements @@ -320,27 +345,26 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Convert bus BW to algorithm BW if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { float ratio = 1.0f; - if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps; - else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0; + if (a == NCCL_ALGO_RING || a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= (1.0 * nRanks) / nsteps; else ratio *= .5; busBw *= ratio; } comm->bandwidths[coll][a][p] = busBw; - comm->latencies[coll][a][p] = baseLat[a][p]; - float intraLat = hwLat[intraHw[a]][a][p]; + comm->latencies[coll][a][p] = comm->tunerConstants.baseLatencies[a][p]; + float intraLat = comm->tunerConstants.hwLatencies[intraHw[a]][a][p]; // With ppn=1 latencies are fully exposed, use the Tree network latency - float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p]; + float interLat = ppn == 1 ? comm->tunerConstants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_TREE][p] : comm->tunerConstants.hwLatencies[NCCL_HW_NET][a][p]; interLat += graphs[a]->latencyInter; // Also add the flush extra latency if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; if (a == NCCL_ALGO_RING) { - float lat = hwLat[hw[a]][a][p]; + float lat = comm->tunerConstants.hwLatencies[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { if (graphs[a]->sameChannels) { comm->latencies[coll][a][p] += lat; } else { - if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling + if (p == NCCL_PROTO_SIMPLE) lat = comm->tunerConstants.hwLatencies[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling comm->latencies[coll][a][p] += nsteps*lat; } } else { @@ -371,8 +395,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat; } else if (a == NCCL_ALGO_PAT) { if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { - comm->latencies[coll][a][p] = 8 // Base time - + log2i(nNodes) * (interLat/3.5) // Log latency + comm->latencies[coll][a][p] += log2i(nNodes) * (interLat/3.5) // Log latency + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point. } } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 96b0c9a7c..010120627 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -917,31 +917,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha if (*netNode != NULL) return ncclSuccess; - const char* pciSysPath = pciPath; - if (pciSysPath) { - char subSystem[PATH_MAX]; - NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); - // This is not a PCI device (virtual, usb, ...). - if (strcmp(subSystem, "pci") != 0) { - INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). 
Attaching to first CPU", pciSysPath, subSystem); - pciSysPath = NULL; - } - } - struct ncclXmlNode* parent = NULL; if (forceParent) { parent = forceParent; - } else if (pciSysPath) { - int offset; - for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - strcpy(busId, pciSysPath+offset+1); - NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); - NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); - NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else { - // Virtual NIC, no PCI device, attach to first CPU - NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + const char* pciSysPath = pciPath; + if (pciSysPath) { + char subSystem[PATH_MAX]; + NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); + // This is not a PCI device (virtual, usb, ...). + if (strcmp(subSystem, "pci") != 0 && !forceParent) { + INFO(NCCL_NET | NCCL_GRAPH, "Topology detection: network path (name = %s) %s is not a PCI device (%s). Attaching to first CPU", netName, pciSysPath, subSystem); + pciSysPath = NULL; + } + } + + if (pciSysPath) { + int offset; + for (offset = strlen(pciSysPath) - 1; pciSysPath[offset] != '/'; offset--); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + strcpy(busId, pciSysPath + offset + 1); + NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); + NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } else { + // Virtual NIC, no PCI device, attach to first CPU + NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + } } struct ncclXmlNode* nicNode = NULL; diff --git a/src/graph/xml.h b/src/graph/xml.h index ad9f0faff..ac9ef7286 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -124,6 +124,13 @@ static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrN return ncclSuccess; } +static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? 
strtoull(str, NULL, 0) : defaultValue; + return ncclSuccess; +} + static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); diff --git a/src/group.cc b/src/group.cc index 08ac54e9e..aa2824412 100644 --- a/src/group.cc +++ b/src/group.cc @@ -11,6 +11,9 @@ #include "channel.h" #include #include "bootstrap.h" +#include "ce_coll.h" +#include "profiler.h" +#include "nvtx.h" #define GROUP_MAX_RECLAIM_STEPS 10 @@ -90,7 +93,7 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); @@ -100,7 +103,7 @@ ncclResult_t ncclGroupStart() { NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); exit: @@ -110,7 +113,7 @@ ncclResult_t ncclGroupEnd() { NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo); ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); TRACE_CALL("ncclGroupSimulateEnd()"); exit: @@ -123,64 +126,87 @@ struct ncclPreconnectJob { bool* algoNeedConnect; }; +struct ncclPrepareTasksAndCollPreconnectJob { + struct ncclAsyncJob base; + struct ncclComm* comm; + ncclSimInfo_t* simInfo; +}; + ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); return ncclSuccess; } -ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { - struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; - struct ncclComm* comm = job->comm; - ncclResult_t ret = ncclSuccess; - - CUDACHECK(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); +static ncclResult_t ncclCollPreconnect(struct ncclComm* comm, bool* algoNeedConnect) { for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { - if (job->algoNeedConnect[i]) { + if (algoNeedConnect[i]) { switch (i) { case NCCL_ALGO_RING: { - NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); + NCCLCHECK(ncclTransportRingConnect(comm)); break; } case NCCL_ALGO_TREE: { - NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); + NCCLCHECK(ncclTransportTreeConnect(comm)); break; } case NCCL_ALGO_NVLS: { /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up * NVLS intra-node buffer */ - NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); + NCCLCHECK(ncclNvlsBufferSetup(comm)); break; } case NCCL_ALGO_NVLS_TREE: { - NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); + NCCLCHECK(ncclNvlsTreeConnect(comm)); break; } case NCCL_ALGO_COLLNET_CHAIN: { - NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + NCCLCHECK(ncclCollNetChainBufferSetup(comm)); break; } case 
NCCL_ALGO_COLLNET_DIRECT: { - NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + NCCLCHECK(ncclCollNetDirectBufferSetup(comm)); break; } case NCCL_ALGO_PAT: { - NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); + NCCLCHECK(ncclTransportPatConnect(comm)); break; } // Yes, it's a dead code. That's fine... // coverity[dead_error_begin] default: { - ret = ncclInternalError; - goto fail; + NCCLCHECK(ncclInternalError); } } } } + return ncclSuccess; +} + +ncclResult_t ncclPrepareTasksAndCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPrepareTasksAndCollPreconnectJob* job = (ncclPrepareTasksAndCollPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + bool needConnect; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool)*NCCL_NUM_ALGORITHMS); + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, job->simInfo)); + if (comm->cuMemSupport && needConnect) NCCLCHECK(ncclCollPreconnect(comm, algoNeedConnect)); + return ncclSuccess; +} + +ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + if (!job_->isThreadMain) CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECKGOTO(ncclCollPreconnect(comm, job->algoNeedConnect), ret, fail); exit: free(job->algoNeedConnect); @@ -194,52 +220,33 @@ struct ncclGroupSymmetricJob { struct ncclComm* comm; }; -NCCL_PARAM(WinStride, "WIN_STRIDE", -1); - ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) { struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_; struct ncclComm* comm = job->comm; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - if (comm->baseStride == 0) { - cudaStream_t hostStream; - // first time to allocate symmetric VA space. - // calling into this function means symmetric is supported. 
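For context on the preconnect helpers above: ncclPrepareTasks marks which algorithms the queued collectives will actually use, and the preconnect step only sets up transports for those. A simplified sketch of that algoNeedConnect pattern (names and algorithm list are stand-ins):

// Sketch of the algoNeedConnect pattern: mark needed algorithms, then connect
// only those. The connect step is a stand-in for ncclTransport*Connect calls.
#include <cstring>

enum { ALGO_TREE, ALGO_RING, ALGO_NVLS, NUM_ALGOS };

static void markNeededAlgos(bool need[NUM_ALGOS]) {
  // Pretend the queued work needs Ring and NVLS only.
  need[ALGO_RING] = true;
  need[ALGO_NVLS] = true;
}

static int preconnect(const bool need[NUM_ALGOS]) {
  int connected = 0;
  for (int a = 0; a < NUM_ALGOS; a++) {
    if (!need[a]) continue;   // skip algorithms no queued task will use
    connected++;              // stand-in for the per-algorithm transport setup
  }
  return connected;
}

int main() {
  bool need[NUM_ALGOS];
  std::memset(need, 0, sizeof(need));
  markNeededAlgos(need);
  return preconnect(need) == 2 ? 0 : 1;
}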
- struct ncclSymDevBase* symBase = NULL; - size_t size = ncclSymDevBase::size(comm->localRanks); - if (ncclParamWinStride() != -1) { - comm->baseStride = ncclParamWinStride(); - } else { - size_t maxStride = 0; - for (int r = 0; r < comm->nRanks; ++r) - if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem; - comm->baseStride = maxStride; - } - INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30); - NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail); - comm->symAllocHead = 0; - - // Allocate symmetric memory for NCCL internal usage - NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail); - assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride)); - NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); - CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail); - CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); - - comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride); - comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr; - comm->symDevComm.nRanks = comm->localRanks; - comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks); - comm->symDevComm.rank = comm->localRank; - comm->symDevComm.stride4G = comm->baseStride >> 32; + + while (!ncclIntruQueueEmpty(&comm->devrState.regTaskQueue)) { + struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&comm->devrState.regTaskQueue); + NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup( + comm, task->userPtr, task->userSize, task->winFlags, task->outWinDev), + ret, fail); + free(task); } - while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) { - struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue); - NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail); + while (!ncclIntruQueueEmpty(&comm->devrState.commCreateTaskQueue)) { + struct ncclDevrCommCreateTask* task = ncclIntruQueueDequeue(&comm->devrState.commCreateTaskQueue); + NCCLCHECKGOTO(ncclDevrCommCreateInternal( + comm, (struct ncclDevCommRequirements const*)task->reqs, task->outDevComm), + ret, fail); + freeDevCommRequirements(task->reqs); // free additional task memory for reqs + free(task); + } + + while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); + NCCLCHECKGOTO(ncclCeInit(task->comm), ret, fail); free(task); } @@ -296,7 +303,11 @@ static ncclResult_t doLaunches(struct ncclComm* head) { comm->planner.unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); - NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + if (plan->isCeColl) { + NCCLCHECKGOTO(ncclLaunchCeColl(comm, plan), result, failure); + } else { + NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + } } // Barrier reduction input indicates if we require further rounds. 
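The doLaunches() change above now branches per plan: plans flagged isCeColl are submitted through the copy-engine path (ncclLaunchCeColl) while the rest keep the normal kernel launch. A toy sketch of that dispatch, with stand-in types:

// Sketch of the per-plan launch dispatch; launchCe/launchKernel are stand-ins.
#include <cstdio>

struct Plan { bool isCeColl; };

static int launchCe(const Plan&)     { std::puts("launch via copy engine"); return 0; }
static int launchKernel(const Plan&) { std::puts("launch via CUDA kernel"); return 0; }

static int launch(const Plan& plan) {
  // Same shape as the doLaunches() change: branch once per unlaunched plan.
  return plan.isCeColl ? launchCe(plan) : launchKernel(plan);
}

int main() {
  Plan a{true}, b{false};
  return launch(a) | launch(b);
}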
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0); @@ -392,6 +403,12 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuenext == nullptr) { + job->isThreadMain = true; + ncclAsyncJobMain(job); + job->state = ncclGroupJobJoined; + return job->result; + } do { PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail); job = job->next; @@ -444,6 +461,51 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue* asyncCollJobs) { + if (ncclParamSingleProcMemRegEnable()) { + struct ncclPrepareTasksAndCollPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclPrepareTasksAndCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + job->simInfo = simInfo; + ncclIntruQueueEnqueue(asyncCollJobs, &job->base); + } else { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + CUDACHECK(cudaSetDevice(comm->cudaDev)); + NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo)); + + if (comm->cuMemSupport && needConnect) { + ncclResult_t ret; + struct ncclPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + if ((ret = ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS))) { + free(job); + NCCLCHECK(ret); + } + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(asyncCollJobs, &job->base); + } + } + return ncclSuccess; +} + static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) { ncclResult_t ret = ncclSuccess; struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; @@ -518,27 +580,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf // at the same time. 
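asyncJobLaunch above gained a fast path: when the queue holds exactly one job it runs it on the calling thread (isThreadMain) instead of spawning and joining a pthread, which is why the preconnect functions now skip cudaSetDevice and affinity changes in that case. A simplified sketch, with stand-in types:

// Sketch of the single-job fast path: run the only job inline on the caller's
// thread; otherwise spawn one worker per job. Fields are simplified stand-ins.
#include <pthread.h>
#include <cstdio>

struct Job {
  Job* next = nullptr;
  bool isThreadMain = false;
  int (*func)(Job*) = nullptr;
};

static void* jobMain(void* arg) { Job* j = (Job*)arg; j->func(j); return nullptr; }

static int launchJobs(Job* head) {
  if (head && head->next == nullptr) {   // exactly one job: no thread needed
    head->isThreadMain = true;           // lets the job skip setDevice/affinity changes
    jobMain(head);
    return 0;
  }
  for (Job* j = head; j; j = j->next) {  // otherwise, one worker thread per job
    pthread_t t;
    pthread_create(&t, nullptr, jobMain, j);
    pthread_join(t, nullptr);            // simplified: join immediately
  }
  return 0;
}

int main() {
  Job j; j.func = [](Job*) { std::puts("ran inline"); return 0; };
  return launchJobs(&j);
}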
comm = cliqueHead; do { - bool needConnect = false; - bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; - memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); - - if (comm->cuMemSupport && needConnect) { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclCollPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->base.abortFlagDev = comm->abortFlagDev; - job->comm = comm; - NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); - memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); - } + NCCLCHECKGOTO(ncclPrepareTasksAndCollPreconnect(comm, simInfo, &asyncCollJobs), ret, fail); comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); // connect @@ -617,6 +659,13 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { goto exit; } + if (ncclProfilerApiState.profilerGroupDepth > 0) { + ncclProfilerApiState.profilerGroupDepth--; + } + if (ncclProfilerApiState.profilerGroupDepth == 0) { + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupEndApiStart)); + } + if ((--ncclGroupDepth) > 0) goto exit; if ((ret = ncclGroupError) != ncclSuccess) goto fail; @@ -701,6 +750,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { groupLocalResetJobState(); exit: + // Profiler group API start is called inside taskAppend to get graph capture information for the event + NCCLCHECK(ncclProfilerStopGroupApiEvent()); return ret; fail: if (groupJob) { diff --git a/src/include/allocator.h b/src/include/allocator.h index 189c3d4e2..05da29a62 100644 --- a/src/include/allocator.h +++ b/src/include/allocator.h @@ -7,7 +7,55 @@ #ifndef NCCL_ALLOCATOR_H_ #define NCCL_ALLOCATOR_H_ -ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr); -ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr); +//////////////////////////////////////////////////////////////////////////////// +// ncclSpace: Allocates contiguous segments of non-negative integers. Useful +// as a memory allocator when we can't put allocator state within the memory +// being allocated. + +struct ncclSpace { + int count; + int capacity; + int64_t* cuts; +}; + +void ncclSpaceConstruct(struct ncclSpace* a); +void ncclSpaceDestruct(struct ncclSpace* a); +ncclResult_t ncclSpaceAlloc(struct ncclSpace* a, int64_t spaceLimit, int64_t objSize, int objAlign, int64_t* outObjOffset); +ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t objOffset, int64_t objSize); + + +//////////////////////////////////////////////////////////////////////////////// +// ncclShadowPool: Allocates device-side objects, their host-side shadows, and +// maintains the device->host object address mapping. 
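The ncclSpace interface above hands out aligned offsets inside a flat window rather than pointers, which is what allows the allocator state to live outside the allocated memory. A standalone sketch with the same call shape (this simple first-fit list is a stand-in, not the real cuts-based structure):

// Illustrative offset allocator: returns aligned offsets inside a fixed window.
#include <cassert>
#include <cstdint>
#include <vector>

struct Range { int64_t off, end; };
static std::vector<Range> g_used;  // kept sorted by offset

static int64_t spaceAlloc(int64_t limit, int64_t size, int64_t align) {
  int64_t cur = 0;
  for (size_t i = 0; i <= g_used.size(); i++) {
    int64_t start = (cur + align - 1) / align * align;          // align up
    int64_t next = (i < g_used.size()) ? g_used[i].off : limit; // next occupied block
    if (start + size <= next) {
      g_used.insert(g_used.begin() + i, {start, start + size});
      return start;
    }
    if (i < g_used.size()) cur = g_used[i].end;
  }
  return -1;  // out of space
}

static void spaceFree(int64_t off) {
  for (size_t i = 0; i < g_used.size(); i++)
    if (g_used[i].off == off) { g_used.erase(g_used.begin() + i); return; }
}

int main() {
  int64_t a = spaceAlloc(1 << 20, 4096, 256);
  int64_t b = spaceAlloc(1 << 20, 512, 512);
  assert(a == 0 && b == 4096);
  spaceFree(a);
  assert(spaceAlloc(1 << 20, 256, 128) == 0);  // freed segment is reusable
  return 0;
}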
+ +struct ncclShadowObject; +struct ncclShadowPage; +struct ncclShadowPool { + int count, hbits; + struct ncclShadowObject** table; + cudaMemPool_t memPool; + struct ncclShadowPage* pages; +}; + +void ncclShadowPoolConstruct(struct ncclShadowPool*); +ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool*); +ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool*, size_t size, void** outDevObj, void** outHostObj, cudaStream_t stream); +ncclResult_t ncclShadowPoolFree(struct ncclShadowPool*, void* devObj, cudaStream_t stream); +ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool*, void* devObj, void** outHostObj); + +template +static inline ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool* pool, T** outDevObj, T** outHostObj, cudaStream_t stream) { + void* devObj; + void* hostObj; + ncclResult_t got = ncclShadowPoolAlloc(pool, sizeof(T), &devObj, &hostObj, stream); + if (outDevObj) *outDevObj = (T*)devObj; + if (outHostObj) *outHostObj = (T*)hostObj; + return got; +} + +template +static inline ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, T* devObj, T** hostObj) { + return ncclShadowPoolToHost(pool, (void*)devObj, (void**)hostObj); +} #endif diff --git a/src/include/bitops.h b/src/include/bitops.h index 71053ed49..badc91b50 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -41,6 +41,9 @@ constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { #endif } +#define BIT(x) (1UL << (x)) +#define MASK(x) ((1UL << x) - 1UL) + #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) @@ -68,14 +71,26 @@ static __host__ __device__ constexpr Z roundDown(X x, Y y) { } // assumes second argument is a power of 2 -template -static __host__ __device__ constexpr Z alignUp(X x, int a) { - return (x + a-1) & Z(-a); +template +static __host__ __device__ constexpr Z alignUp(X x, Y a) { + return (x + a-1) & -Z(a); } +template +static __host__ __device__ T* alignUp(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} + // assumes second argument is a power of 2 -template -static __host__ __device__ constexpr Z alignDown(X x, int a) { - return x & Z(-a); +template +static __host__ __device__ constexpr Z alignDown(X x, Y a) { + return x & -Z(a); +} + +template +static __host__ __device__ T* alignDown(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); } template @@ -341,7 +356,7 @@ static __host__ UInt reverseSubBits(UInt x) { default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); } return reverseSubBits(x); - } else if (nSubBits == 1) { + } else if (nSubBits <= 1) { return x; } else { UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); diff --git a/src/include/ce_coll.h b/src/include/ce_coll.h new file mode 100644 index 000000000..e47effb8c --- /dev/null +++ b/src/include/ce_coll.h @@ -0,0 +1,76 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
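The allocator.h hunk above introduces ncclSpace, which hands out integer offsets into a region whose bookkeeping cannot live inside the region itself (for example device memory), plus templated ncclShadowPoolAlloc/ToHost convenience wrappers. The sketch below is a deliberately naive free-map version of the same idea, not NCCL's cut-based structure: it only mirrors the alloc(spaceLimit, objSize, objAlign) / free(offset) shape (the declared ncclSpaceFree also takes the object size), and ToySpace is an invented name.

```
#include <cstdint>
#include <cstdio>
#include <map>

// Toy offset allocator: all state lives outside the managed region.
// First-fit over a sorted map of allocated segments (offset -> size).
struct ToySpace {
  std::map<int64_t, int64_t> used;

  static int64_t alignUp(int64_t x, int64_t a) { return (x + a - 1) & -a; }

  // Find a free offset for objSize bytes, aligned to objAlign (power of 2),
  // inside [0, spaceLimit). Returns -1 when the region is exhausted.
  int64_t alloc(int64_t spaceLimit, int64_t objSize, int64_t objAlign) {
    int64_t cursor = 0;
    for (auto const& seg : used) {
      int64_t candidate = alignUp(cursor, objAlign);
      if (candidate + objSize <= seg.first) { used[candidate] = objSize; return candidate; }
      cursor = seg.first + seg.second;
    }
    int64_t candidate = alignUp(cursor, objAlign);
    if (candidate + objSize > spaceLimit) return -1;
    used[candidate] = objSize;
    return candidate;
  }

  void free(int64_t offset) { used.erase(offset); }
};

int main() {
  ToySpace s;
  int64_t a = s.alloc(1 << 20, 4096, 4096);  // 4KB object, 4KB aligned
  int64_t b = s.alloc(1 << 20, 100, 16);
  printf("a=%lld b=%lld\n", (long long)a, (long long)b);
  s.free(a);                                 // hole at offset a is reused below
  printf("reuse=%lld\n", (long long)s.alloc(1 << 20, 512, 256));
}
```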
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CE_COLL_H_ +#define NCCL_CE_COLL_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "bitops.h" + +// Memory operations per rank for different synchronization protocols +#define NCCL_CE_SYNC_OPS_PER_RANK_MC 2 +#define NCCL_CE_SYNC_OPS_PER_RANK_UC 3 + +struct ncclCeColl { + uint8_t* baseUCSymReadyPtr; + uint8_t* baseUCSymComplPtr; + size_t baseUCSymReadyOffset; + size_t baseUCSymComplOffset; + uint32_t ceSeqNum; + bool useCompletePtr; + uint32_t intraBatchSyncFreq; + uint64_t intraBatchSyncMsgThreshold; + struct ncclDevrWindow* ceSyncWin; +}; + +struct ncclCeInitTask { + struct ncclCeInitTask *next; + struct ncclComm* comm; +}; + +struct alignas(16) ncclCeCollArgs { + ncclFunc_t func; + int rootRank; + size_t nElts; + size_t eltSize; + uint8_t* sendBuff; + uint8_t* recvBuff; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; +}; + +struct ncclCeBatchOpsParams { + void** dsts; + void** srcs; + size_t* sizes; + size_t numOps; + bool intraBatchSync; +#if CUDART_VERSION >= 12080 + cudaMemcpyAttributes* attrs; + size_t* attrIdxs; + size_t numAttrs; +#endif +}; + +bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); + +ncclResult_t ncclCeInit(struct ncclComm* comm); + +ncclResult_t ncclCeFinalize(struct ncclComm* comm); + +ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream); + +ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan); + +ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); +#endif /* NCCL_CE_COLL_H_ */ diff --git a/src/include/channel.h b/src/include/channel.h index ee9aa6d0b..bd34f54c1 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -17,15 +17,16 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { + int base; if (comm->nNodes > 1) { int nodeDelta = p2pRound/comm->maxLocalRanks; int localDelta = p2pRound%comm->maxLocalRanks; - int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; - return base & 0xff; } else { - return p2pRound & 0xff; + base = p2pRound; } + return reverseBits(base, log2Up(comm->p2pnChannels)); } #endif diff --git a/src/include/coll_net.h b/src/include/coll_net.h index affbf0a24..574fd95eb 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -16,7 +16,7 @@ typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { 
NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(comm->collNetContext, dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } @@ -29,6 +29,7 @@ static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* d static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } +static ncclResult_t collNetFinalize(struct ncclComm* comm, void* ctx) { NCCLCHECK(comm->ncclCollNet->finalize(ctx)); return ncclSuccess; } static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } diff --git a/src/include/collectives.h b/src/include/collectives.h index c68b0418c..038eb8dd1 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -8,7 +8,7 @@ #define NCCL_COLLECTIVES_H_ #include "nccl.h" -#include "nccl_common.h" +#include "nccl_tuner.h" #include "device.h" #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
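The channel.h hunk above now derives the p2p channel base by bit-reversing the round-derived index over log2Up(comm->p2pnChannels) bits, which spreads consecutive rounds across channels (the device.h hunk later in this patch drops the per-part reversal to match). A standalone illustration of why bit reversal spreads consecutive indices, assuming a power-of-two channel count; this reverseBits is a plain reimplementation for the demo, not NCCL's helper:

```
#include <cstdio>

// Reverse the low `bits` bits of x.
static unsigned reverseBits(unsigned x, int bits) {
  unsigned r = 0;
  for (int i = 0; i < bits; i++) r |= ((x >> i) & 1u) << (bits - 1 - i);
  return r;
}

int main() {
  const int nChannels = 8;   // must be a power of two for this mapping
  const int bits = 3;        // log2(nChannels)
  // Consecutive rounds land on channels 0,4,2,6,1,5,3,7: maximally spread.
  for (int round = 0; round < nChannels; round++)
    printf("round %d -> channel base %u\n", round, reverseBits(round, bits));
}
```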
@@ -18,10 +18,16 @@ #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLTOALL_SLICESTEPS 1 +#define ALLTOALL_CHUNKSTEPS 1 #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) #define BROADCAST_SLICESTEPS 1 #define BROADCAST_CHUNKSTEPS 1 +#define GATHER_SLICESTEPS 1 +#define GATHER_CHUNKSTEPS 1 +#define SCATTER_SLICESTEPS 1 +#define SCATTER_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above diff --git a/src/include/comm.h b/src/include/comm.h index 1378e0765..22faf3682 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -18,6 +18,9 @@ #include "graph.h" #include "profiler.h" #include "allocator.h" +#include "dev_runtime.h" +#include "sym_kernels.h" +#include "ce_coll.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -198,12 +201,14 @@ struct ncclTaskColl { int32_t nMaxChannels:8; int32_t nWarps:8; int32_t algorithm:8, protocol:8; - uint32_t isCollnet:1, isNvls:1; - uint32_t devFuncId:30; + uint32_t isCollnet:1, isNvls:1, isSymLast:1; + uint32_t devFuncId:29; int regBufType; // number of elements in planner->ipcMemQueue associated with this collective int nCleanupQueueElts; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; void* sendMhandle; void* recvMhandle; void** sendNetHandles; @@ -217,12 +222,16 @@ struct ncclTaskColl { // Profiler plugin int eActivationMask; + void* groupApiEventHandle; + void* collApiEventHandle; void* eventHandle; uint8_t nChannels; }; + struct ncclTaskP2p { struct ncclTaskP2p* next; ncclFunc_t func; + ncclFunc_t collAPI; void* buff; size_t count; ncclDataType_t datatype; @@ -231,6 +240,8 @@ struct ncclTaskP2p { // Profiler plugin int eActivationMask; + void* groupApiEventHandle; + void* p2pApiEventHandle; void* eventHandle; uint8_t nChannels; }; @@ -246,12 +257,14 @@ struct ncclKernelPlan { bool persistent; // aka captured in a graph bool isHostCbEnq; bool isSymColl; + bool isCeColl; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void* kernelFn; union { struct ncclDevKernelArgs* kernelArgs; - struct ncclSymDevArgs* kernelSymArgs; + void* kernelSymArgs; + struct ncclCeCollArgs* ceCollArgs; }; size_t kernelArgsSize; uint64_t channelMask; // bitset of which channels are present @@ -270,6 +283,8 @@ struct ncclKernelPlan { struct ncclIntruQueue proxyOpQueue; // Profiler plugin + void* groupApiEventHandle; + void* kernelLaunchEventHandle; void* groupEventHandle; }; @@ -360,9 +375,8 @@ struct ncclKernelPlanner { struct ncclTaskCollSorter collSorter; struct Peer* peers/*[nRanks]*/; int nTasksColl, nTasksP2p; + int nTasksP2pSend, nTasksP2pRecv; bool persistent; - bool isSymColl; - // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; // The most recent user stream. 
Ignored if streams==nullptr @@ -378,6 +392,8 @@ struct ncclKernelPlanner { ////////////////////////////////////////////////////////////////////////////// struct ncclIntruQueue collTaskQueue; + struct ncclIntruQueue collCeTaskQueue; + struct ncclIntruQueue collSymTaskQueue; struct ncclIntruQueue collWorkQueue; struct ncclIntruQueue tmpCollWorkQueue; struct ncclIntruQueue collCleanupQueue; @@ -417,6 +433,8 @@ typedef enum ncclGroupTaskType { ncclGroupTaskTypeNum = 2, } ncclGroupTaskType_t; +struct ncclCommSymTeams; + struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; @@ -436,10 +454,12 @@ struct ncclComm { bool peerInfoValid; ncclNet_t* ncclNet; + void* netContext; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; + void* collNetContext; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; @@ -472,6 +492,7 @@ struct ncclComm { int localRank; int localRanks; int maxLocalRanks; + int minLocalRanks; int* rankToNode; int* rankToLocalRank; int* localRankToRank; @@ -482,6 +503,9 @@ struct ncclComm { struct cliqueInfo clique; // Our MNNVL clique information int cliqueRank; // Our rank within the MNNVL clique + // NVL Domain info + ncclNvlDomainInfo_v5_t nvlDomainInfo; + bool checkPointers; bool dmaBufSupport; @@ -508,7 +532,8 @@ struct ncclComm { int p2pChunkSize; int nvlsChunkSize; - // Algorithm/Protocols thresholds + // Tuner values + ncclTunerConstants_t tunerConstants; ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -527,8 +552,7 @@ struct ncclComm { uint32_t destroyFlag; // Device side of the communicator (for cudaFree's) - struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - struct ncclSymDevComm symDevComm; + struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 @@ -624,6 +648,10 @@ struct ncclComm { uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; struct ncclProfilerProxy profiler; + // CE Collective + struct ncclCeColl ceColl; + struct ncclIntruQueue ceInitTaskQueue; + // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; @@ -632,13 +660,10 @@ struct ncclComm { bool useNetPXN; bool useGdr; int splitCount; - // symmetric buffer - uint8_t* baseUCSymPtr; - uint8_t* baseMCSymPtr; - size_t baseStride; - size_t symAllocHead; - CUmemGenericAllocationHandle symMCHandle; - struct ncclIntruQueue symRegTaskQueue; + + struct ncclDevrState devrState; // The symmetric runtime state + struct ncclSymkState symkState; // The symmetric kernels state (built on previous) + uint64_t endMagic; }; diff --git a/src/include/core.h b/src/include/core.h index a1754beeb..2ce1d8e78 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -16,6 +16,7 @@ #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ + extern "C" \ __attribute__ ((visibility("default"))) \ __attribute__ ((alias(#func))) \ ret p##func (args); \ diff --git a/src/include/cpuset.h b/src/include/cpuset.h index 99e3edf4d..df936a31e 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,54 +7,38 @@ #ifndef NCCL_CPUSET_H_ #define NCCL_CPUSET_H_ -// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t +#include "nccl.h" +#include +#include +#include +#include -static int hexToInt(char c) { - int v = c - '0'; - if (v < 0) return -1; - if (v > 9) v = 10 + c - 'a'; - if ((v < 0) || (v > 15)) return -1; - return v; -} +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t. +// The bitmask is divided into chunks of 32 bits, each of them represented by 8 hex number. +#define U32_LEN 32 // using uint32_t +#define CPU_SET_N_U32 (CPU_SETSIZE / U32_LEN) -#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) +static ncclResult_t ncclStrToCpuset(const char* maskStr, cpu_set_t* set) { + uint32_t cpumasks[CPU_SET_N_U32] = {0}; -static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { - uint32_t cpumasks[CPU_SET_N_U32]; - int m = CPU_SET_N_U32-1; - cpumasks[m] = 0; - for (int o=0; o 0) { + cpumasks[--m] = strtoul(token, NULL, /*base = hex*/ 16); + token = strtok(NULL, ","); } - // Copy cpumasks to mask - for (int a=0; m=0; o--) { - if (c == 0 && m8[o] == 0) continue; - sprintf(str+c, "%02x", m8[o]); - c+=2; - if (o && o%4 == 0) { - sprintf(str+c, ","); - c++; + // list all the CPUs as part of the CPU set, starting with the lowest mask (= current value of m) + CPU_ZERO(set); + for (int a = 0; (a + m) < CPU_SET_N_U32; a++) { + // each mask is U32_LEN CPUs, list them all if the bit is on + for (int i = 0; i < U32_LEN; ++i) { + if (cpumasks[a + m] & (1UL << i)) CPU_SET(i + a * U32_LEN, set); } } - str[c] = '\0'; return ncclSuccess; } @@ -83,4 +67,31 @@ static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) { return str; } +static ncclResult_t ncclStrListToCpuset(const char* userStr, cpu_set_t* mask) { + // reset the CPU set + CPU_ZERO(mask); + const char delim[] = ","; + char* str = strdup(userStr); + char* token = strtok(str, delim); + while (token != NULL) { + uint64_t cpu = strtoull(token, NULL, 0); + CPU_SET(cpu, mask); + token = strtok(NULL, delim); + } + free(str); + return ncclSuccess; +} + +static ncclResult_t ncclCpusetToStrList(cpu_set_t* mask, char* str, size_t len) { + if (len == 0) return ncclSuccess; + str[0] = '\0'; + int count = 0; + for (uint64_t id = 0; id < CPU_SETSIZE; ++id) { + if (CPU_ISSET(id, mask)) { + snprintf(str + strlen(str), len - strlen(str), "%s%lu", (count++ == 0) ? 
"" : ",", id); + } + } + return ncclSuccess; +} + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index 2edc60f21..f05f13e43 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -114,6 +114,12 @@ DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); #endif +/* Stream-MemOp support */ +DECLARE_CUDA_PFN_EXTERN(cuStreamBatchMemOp, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWaitValue32, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWaitValue64, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWriteValue32, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWriteValue64, 11070); #endif ncclResult_t ncclCudaLibraryInit(void); diff --git a/src/include/debug.h b/src/include/debug.h index 4e50cbf5a..3822e8760 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -17,6 +17,7 @@ #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; +extern uint64_t ncclDebugMask; extern FILE *ncclDebugFile; void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); @@ -27,11 +28,30 @@ extern char ncclLastError[]; #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) -#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) + +#define INFO(FLAGS, ...) \ + do{ \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_INFO && ((unsigned long)(FLAGS) & ncclDebugMask)) || (level < 0)) \ + ncclDebugLog(NCCL_LOG_INFO, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \ + } while(0) + +#define TRACE_CALL(...) \ + do { \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_TRACE && (NCCL_CALL & ncclDebugMask)) || (level < 0)) { \ + ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__); \ + } \ + } while (0) #ifdef ENABLE_TRACE -#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define TRACE(FLAGS, ...) \ + do { \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if ((level >= NCCL_LOG_TRACE && ((unsigned long)(FLAGS) & ncclDebugMask)) || (level < 0)) { \ + ncclDebugLog(NCCL_LOG_TRACE, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \ + } \ + } while (0) #else #define TRACE(...) #endif diff --git a/src/include/dev_runtime.h b/src/include/dev_runtime.h new file mode 100644 index 000000000..5f6e66e33 --- /dev/null +++ b/src/include/dev_runtime.h @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_RUNTIME_H_ +#define NCCL_DEVICE_RUNTIME_H_ +#include "nccl.h" +#include "nccl_device.h" +#include "nccl_common.h" +#include "allocator.h" +#include "bitops.h" +#include "utils.h" + +//////////////////////////////////////////////////////////////////////////////// +// ncclDevr[_]: runtime implements for symmetric API. 
+ +struct ncclDevrMemory; +struct ncclDevrWindow { + struct ncclDevrMemory* memory; + void* userPtr; + size_t size; + size_t bigOffset; // Offset in big VA space. + int winFlags; + void* localRegHandle; + struct ncclWindow_vidmem* vidmem; +}; +struct ncclDevrWindowSorted; +struct ncclDevrTeam; + +struct ncclDevrRegTask { + struct ncclDevrRegTask *next; + void* userPtr; + size_t userSize; + int winFlags; + ncclWindow_t* outWinDev; +}; + +struct ncclDevrCommCreateTask { + struct ncclDevrCommCreateTask *next; + struct ncclDevCommRequirements* reqs; + struct ncclDevComm* outDevComm; +}; + +struct ncclDevrState { + // Like localRank/localRanks except "lsa" ranks must be consecutive in the world + // and all lsa subsets have the same number of ranks. If any condition is + // false then the lsa team is just the singleton of self. + int lsaSelf; + int lsaSize; + int* lsaRankList; + + size_t granularity; // cuMemGetAllocationGranularity + struct ncclDevrMemory* memHead; + struct ncclDevrWindowSorted* winSorted; + int winSortedCapacity, winSortedCount; + struct ncclDevrTeam* teamHead; + size_t bigSize; // size of our big logical space (128GB?) + struct ncclSpace bigSpace; // allocates our big VA space. + void* lsaFlatBase; // base ptr for all lsa ranks big VA's concatenated together: size = lsaRanks*bigSize + struct ncclShadowPool shadows; + struct ncclDevCommWindowTable* windowTable; + + struct ncclIntruQueue regTaskQueue; + struct ncclIntruQueue commCreateTaskQueue; +}; + +// We assume ncclComm has a `ncclDevrState symState` member. +ncclResult_t ncclDevrInitOnce(struct ncclComm* comm); +ncclResult_t ncclDevrFinalize(struct ncclComm* comm); + +// If found *outWinHost will be populated and *outWinId >= 0, otherwise *outWinId == -1 +ncclResult_t ncclDevrFindWindow(struct ncclComm* comm, void const* userPtr, struct ncclDevrWindow** outWin); + +ncclResult_t ncclDevrWindowRegisterInGroup( + struct ncclComm* comm, void* ptr, size_t size, int winFlags, ncclWindow_t* outWinDev +); + +ncclResult_t ncclDevrCommCreateInternal( + struct ncclComm* comm, struct ncclDevCommRequirements const* reqs, struct ncclDevComm* outDevComm +); +void freeDevCommRequirements( + struct ncclDevCommRequirements* reqs +); + +// Get the corresponding pointer in another lsa rank's symmetric memory window +ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, int lsaRank, void** outPtr); + +// Get the multicast address for a given team +ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, struct ncclTeam lsaTeam, void** outPtr); +#endif diff --git a/src/include/device.h b/src/include/device.h index 2c5ce1029..9ffc26095 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -8,9 +8,8 @@ #define NCCL_DEVICE_H_ #include "nccl.h" -#include "nccl_common.h" +#include "nccl_tuner.h" #include "bitops.h" -#include "symmetric.h" #include #include #include @@ -159,6 +158,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; int hasSeen; + int p2pOnly; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -228,7 +228,7 @@ struct ncclChannelPeer { int refCount; }; -struct ncclDevComm; +struct ncclKernelComm; struct alignas(16) ncclDevWorkP2p { void *sendAddr, *recvAddr; @@ -267,16 +267,10 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2 // ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. 
The device code // uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) { - // Only works because nP2pChannels is pow2 - int nChannelsLog2 = countOneBits(nP2pChannels-1); - int delta = reverseBits(part, nChannelsLog2); - return (base + delta) & (nP2pChannels-1); + return (base + part) & (nP2pChannels-1); } inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) { - // Only works because nP2pChannels is pow2 - int nChannelsLog2 = countOneBits(nP2pChannels-1); - int delta = (channel-base) & (nP2pChannels-1); - return reverseBits(delta, nChannelsLog2); + return (channel - base) & (nP2pChannels-1); } struct alignas(16) ncclDevWorkColl { @@ -413,7 +407,7 @@ struct ncclDevProfiler { } data[MAX_PROFILER_EVENTS_PER_CHANNEL]; }; -struct ncclDevComm { +struct ncclKernelComm { int rank; int nRanks; int node; @@ -436,8 +430,8 @@ struct ncclDevComm { struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; }; -struct alignas(16) ncclDevCommAndChannels { - struct ncclDevComm comm; +struct alignas(16) ncclKernelCommAndChannels { + struct ncclKernelComm comm; struct ncclDevChannel channels[MAXCHANNELS]; }; @@ -448,7 +442,7 @@ enum ncclDevWorkStorageType: uint8_t { }; struct alignas(16) ncclDevKernelArgs { - struct ncclDevComm* comm; + struct ncclKernelComm* comm; uint64_t channelMask; enum ncclDevWorkStorageType workStorageType; uint32_t workMask; diff --git a/src/include/graph.h b/src/include/graph.h index 7475e5a7b..6b926717e 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -115,6 +115,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); struct ncclTopoRanks { + int crossNicRing; int ringRecv[MAXCHANNELS]; int ringSend[MAXCHANNELS]; int ringPrev[MAXCHANNELS]; @@ -131,6 +132,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); +ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time); diff --git a/src/include/group.h b/src/include/group.h index 033a187da..6e317c6c4 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -43,6 +43,7 @@ struct ncclAsyncJob { uint32_t* childAbortFlagDev; /* point to child abortFlagDev */ ncclComm_t comm; int destroyFlag; + bool isThreadMain; }; ncclResult_t ncclAsyncLaunch( diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 0f387c15e..0a3842151 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,6 +7,11 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. 
+#include <bits/c++config.h> +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) + +#include typedef enum { @@ -60,24 +65,11 @@ typedef enum { ncclFuncSendRecv = 5, ncclFuncSend = 6, ncclFuncRecv = 7, - ncclNumFuncs = 8 + ncclFuncAlltoAll = 8, + ncclFuncScatter = 9, + ncclFuncGather = 10, + ncclNumFuncs = 11 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT -#define NCCL_ALGO_UNDEF -1 -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET_DIRECT 2 -#define NCCL_ALGO_COLLNET_CHAIN 3 -#define NCCL_ALGO_NVLS 4 -#define NCCL_ALGO_NVLS_TREE 5 -#define NCCL_ALGO_PAT 6 - -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_UNDEF -1 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 -#define NCCL_ALGO_PROTO_IGNORE -1.0 #endif diff --git a/src/include/nccl_device.h b/src/include/nccl_device.h new file mode 100644 index 000000000..88b2531d1 --- /dev/null +++ b/src/include/nccl_device.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_device/impl/comm__funcs.h" +#include "nccl_device/coop.h" +#include "nccl_device/impl/core__funcs.h" +#include "nccl_device/impl/ll_a2a__funcs.h" +#include "nccl_device/impl/mem_barrier__funcs.h" +//#include "nccl_device/net_barrier__funcs.h" +//#include "nccl_device/net_scratch_a2a__funcs.h" +//#include "nccl_device/barrier__funcs.h" +#include "nccl_device/impl/ptr__funcs.h" diff --git a/src/include/nccl_device/README.md b/src/include/nccl_device/README.md new file mode 100644 index 000000000..bf1728d47 --- /dev/null +++ b/src/include/nccl_device/README.md @@ -0,0 +1,32 @@ +This directory has been structured to make it easy for users to read the headers to learn the API. The files adjacent +to this README are meant for humans. They contain the essential declarations (which types exist, function prototypes) and comments +indicating the contract/usage. Everything else goes into the "impl/" subdirectory. Most modules are stratified into three layers: + +1) "foo.h" Public API declarations. +2) "impl/foo__types.h" struct definitions. Has #include of layer 1. +3) "impl/foo__funcs.h" inline functions. Has #include of layer 2. + +The include dependencies should be acyclic for layers 1 and 2 since order matters for declarations and types. Layer 3 though +can freely have cycles amongst itself ("impl/foo__funcs.h" and "impl/bar__funcs.h" can mutually include each other) since +functions can be defined in any order once declared. + +Translation units should just include "nccl_device.h" to ensure they get all the "impl/foo__funcs.h". But if a translation unit wants +to be more specific about which modules it pulls in, it should include the individual "impl/foo__funcs.h" headers. + +One of the nasty reasons this layering was required is C++ defaulted function parameters: + +``` +// +++ in foo.h +++ +struct Foo; // defined in some __types.h + +// +++ in "impl/foo__types.h" +++ +struct Foo { int x; }; + +// +++ in "bar.h" +++ +// Prototype function where default value is default construction of Foo. Since +// Foo would be incomplete if just including "foo.h" the compiler errors because +// it can't reason about the {}. +// I was able to solve this by including "impl/foo__types.h" instead.
+#include "impl/foo__types.h" +void bar(Foo arg = {}); +``` diff --git a/src/include/nccl_device/comm.h b/src/include/nccl_device/comm.h new file mode 100644 index 000000000..d989ce1f6 --- /dev/null +++ b/src/include/nccl_device/comm.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM_H_ +#define _NCCL_DEVICE_COMM_H_ +#include "core.h" +#endif diff --git a/src/include/nccl_device/coop.h b/src/include/nccl_device/coop.h new file mode 100644 index 000000000..9a8d4b0a8 --- /dev/null +++ b/src/include/nccl_device/coop.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COOP_H_ +#define _NCCL_DEVICE_COOP_H_ +#include "utility.h" + +// ncclCoop[Foo]: NCCL's versions of CUDA's Cooperative Groups. They conform +// to just this subset of the CUDA API: +// int Coop::thread_rank(); +// int Coop::size(); +// int Coop::num_threads(); // same as size() +// void Coop::sync(); + +#if __CUDACC__ +template +struct ncclCoopTile { // An aligned pow2 set of threads within the warp. + static_assert(nccl::utility::isPow2(nThreadsPow2) && nThreadsPow2 <= 32, "Condition required"); + + NCCL_DEVICE_INLINE int thread_rank() const { + return nccl::utility::lane() % nThreadsPow2; + } + NCCL_DEVICE_INLINE constexpr int size() const { return nThreadsPow2; } + NCCL_DEVICE_INLINE constexpr int num_threads() const { return nThreadsPow2; } + + NCCL_DEVICE_INLINE uint32_t laneMask() const { + return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2); + } + NCCL_DEVICE_INLINE void sync() { + __syncwarp(laneMask()); + } +}; +#endif + +#if __CUDACC__ +typedef ncclCoopTile<1> ncclCoopThread; +typedef ncclCoopTile<32> ncclCoopWarp; +#endif + +#if __CUDACC__ +struct ncclCoopLanes { // Some lanes of this warp. + uint32_t lmask; + + NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {} + + NCCL_DEVICE_INLINE int thread_rank() const { + return __popc(lmask & nccl::utility::lanemask_lt()); + } + NCCL_DEVICE_INLINE int size() const { + return __popc(lmask); + } + NCCL_DEVICE_INLINE int num_threads() const { + return __popc(lmask); + } + NCCL_DEVICE_INLINE void sync() { + __syncwarp(lmask); + } +}; +#endif + +#if __CUDACC__ +// A set of consecutive warps that the user has also supplied with a unique +// id from [0..15]. It is an error for two different warp spans with the same +// id to be in a collective concurrently. 
+struct ncclCoopWarpSpan { + uint32_t warp0:8, nWarps:8, id:8; + + NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id): + warp0(warp0), nWarps(nWarps), id(id) { + } + + NCCL_DEVICE_INLINE int thread_rank() const { + return threadIdx.x - 32*warp0; + } + NCCL_DEVICE_INLINE int size() const { + return 32*nWarps; + } + NCCL_DEVICE_INLINE int num_threads() const { + return 32*nWarps; + } + + NCCL_DEVICE_INLINE void sync() { + //asm volatile("barrier.sync %0, %1;" :: "r"(1+id), "r"(32*nWarps) : "memory"); + __barrier_sync_count(1+id, 32*nWarps); + } +}; +#endif + +#if __CUDACC__ +struct ncclCoopCta { + NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x; } + NCCL_DEVICE_INLINE int size() const { return blockDim.x; } + NCCL_DEVICE_INLINE int num_threads() const { return blockDim.x; } + NCCL_DEVICE_INLINE void sync() { __syncthreads(); } +}; +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile coop) { + return coop.laneMask(); +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) { + return coop.lmask; +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) { + return -1u; +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) { + return -1u; +} +#endif + +#if __CUDACC__ +// ncclCoopIsThread: +// At compile time do we know the given coop is a single thread only. +template +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopTile) { + return nThreads == 1; +} +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; } +#endif + +#if __CUDACC__ +// Pick threads of our warp that are safe to use collectively. +NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() { + return ncclCoopLanes{__activemask()}; +} +#endif + +#if __CUDACC__ +// Pick threads of our warp that are safe to use collectively given that this +// is a collective on the provided cooperative group. +template +NCCL_DEVICE_INLINE ncclCoopTile<32> ncclCoopCoalesced(Coop) { + return ncclCoopTile<32>(); +} +NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) { + return coop; +} +template +NCCL_DEVICE_INLINE ncclCoopTile ncclCoopCoalesced(ncclCoopTile coop) { + return coop; +} +#endif + +#endif diff --git a/src/include/nccl_device/core.h b/src/include/nccl_device/core.h new file mode 100644 index 000000000..dd41d6925 --- /dev/null +++ b/src/include/nccl_device/core.h @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
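coop.h above defines NCCL's lightweight cooperative groups, each exposing only thread_rank(), size()/num_threads() and sync(), so one device function can be written against any group shape. The CUDA sketch below imitates that contract with an invented ToyTile type loosely modeled on ncclCoopTile; it does not use the NCCL headers:

```
#include <cstdio>
#include <cuda_runtime.h>

// Toy warp tile: an aligned, power-of-two subset of a warp exposing the
// same thread_rank/size/sync contract as the coop types above.
template<int kThreads>
struct ToyTile {
  __device__ int thread_rank() const { return threadIdx.x % kThreads; }
  __device__ int size() const { return kThreads; }
  __device__ void sync() const {
    unsigned lanes = 0xffffffffu >> (32 - kThreads);            // kThreads consecutive lanes
    __syncwarp(lanes << ((threadIdx.x & 31) & ~(kThreads - 1)));
  }
};

// One function body works for any group honoring the contract.
template<typename Coop>
__device__ int tileSum(Coop coop, int* scratch, int v) {
  scratch[coop.thread_rank()] = v;
  coop.sync();
  int total = 0;
  for (int i = 0; i < coop.size(); i++) total += scratch[i];
  coop.sync();
  return total;
}

__global__ void demo() {
  __shared__ int scratch[32];
  ToyTile<8> tile;                       // four independent 8-thread tiles per warp
  int total = tileSum(tile, scratch + (threadIdx.x & ~7), 1);
  if (tile.thread_rank() == 0)
    printf("tile of %d threads: sum=%d\n", tile.size(), total);
}

int main() {
  demo<<<1, 32>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```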
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE_H_ +#define _NCCL_DEVICE_CORE_H_ +#include +#include "coop.h" +#include "utility.h" + +struct ncclDevComm; +typedef struct ncclDevComm ncclDevComm_t; + +struct ncclTeam; +typedef struct ncclTeam ncclTeam_t; + +// typedef struct ncclWindow_vidmem* ncclWindow_t; // in nccl.h + +struct ncclMultimemHandle; +typedef struct ncclMultimemHandle ncclMultimemHandle_t; + +typedef uint32_t ncclDevResourceHandle; +typedef ncclDevResourceHandle ncclDevResourceHandle_t; + +struct ncclLsaBarrierHandle; +typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t; + +struct ncclLLA2AHandle; +typedef struct ncclLLA2AHandle ncclLLA2AHandle_t; + +struct ncclTeam { + int nRanks, rank, stride; +}; + +#if __cplusplus +template struct ncclSymPtr; +#endif + +#if __cplusplus +struct ncclTeamTagWorld {}; +struct ncclTeamTagLsa {}; +struct ncclTeamTagRail {}; +#endif + +struct ncclDevCommRequirements; +typedef struct ncclDevCommRequirements ncclDevCommRequirements_t; + +struct ncclDevResourceRequirements; +typedef struct ncclDevResourceRequirements ncclDevResourceRequirements_t; + +struct ncclTeamRequirements; +typedef struct ncclTeamRequirements ncclTeamRequirements_t; + +struct ncclDevCommRequirements { + ncclDevResourceRequirements_t* resourceRequirementsList; + ncclTeamRequirements_t* teamRequirementsList; + + bool lsaMultimem; // Enable multimem on lsa team + + int lsaBarrierCount; +}; + +struct ncclDevResourceRequirements { + ncclDevResourceRequirements_t* next; + size_t bufferSize, bufferAlign; + ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate. +}; + +struct ncclTeamRequirements { + ncclTeamRequirements_t* next; + ncclTeam_t team; + bool multimem; + ncclMultimemHandle_t* outMultimemHandle; // If non-null, target assigned during ncclDevCommCreate. +}; + +NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommCreate(ncclComm_t, ncclDevCommRequirements_t const*, ncclDevComm_t* outDevComm); +NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommDestroy(ncclComm_t, ncclDevComm_t const* devComm); + +//////////////////////////////////////////////////////////////////////////////// +// Team API: + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamWorld(ncclComm_t); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamLsa(ncclComm_t); + +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int bPeer); +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int bPeer); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const&, ncclTeam, int rank); +#endif +NCCL_EXTERN_C __host__ int ncclTeamRankToWorld(ncclComm_t, ncclTeam_t, int rank); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const&, ncclTeam, int rank); +#endif +NCCL_EXTERN_C __host__ int ncclTeamRankToLsa(ncclComm_t, ncclTeam_t, int rank); + +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize); +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize); + +// Interpret each team as a set of ranks. This function assumes that `subset` +// is a subset of `parent`. 
Thus the number of ranks in the set difference of +// `parent` minus `subset` is `super.nRanks - subset.nRanks`. Given `index` this +// function returns the index'th element of `parent` minus `subset`. +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index); + +// Equivalent to ncclTeamOuterFactor of lsa team. +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamRail(ncclComm_t); + +// Get offset of resource buffer within `comm.resourceWindow`. +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t); + +#if __CUDACC__ +NCCL_DEVICE_INLINE ncclSymPtr ncclGetResourceBuffer(ncclDevComm const&, ncclDevResourceHandle); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Window API: + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset); +NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer); +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer); +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer); +NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mmHandle); +NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const&); +#endif + +#if __CUDACC__ +// Convenience for combining ncclGet***Pointer() with resource handle. +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const&, ncclDevResourceHandle); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const&, ncclDevResourceHandle, int peer); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const&, ncclDevResourceHandle, ncclTeam, int peer); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const&, ncclDevResourceHandle, ncclMultimemHandle); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const&, ncclDevResourceHandle); +#endif + +#endif // _NCCL_DEVICE_CORE_H_ diff --git a/src/include/nccl_device/impl/comm__funcs.h b/src/include/nccl_device/impl/comm__funcs.h new file mode 100644 index 000000000..0bfe90c91 --- /dev/null +++ b/src/include/nccl_device/impl/comm__funcs.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM__FUNCS_H_ +#define _NCCL_DEVICE_COMM__FUNCS_H_ +#include "comm__types.h" +#endif // _NCCL_DEVICE_COMM__FUNCS_H_ diff --git a/src/include/nccl_device/impl/comm__types.h b/src/include/nccl_device/impl/comm__types.h new file mode 100644 index 000000000..680d7055b --- /dev/null +++ b/src/include/nccl_device/impl/comm__types.h @@ -0,0 +1,40 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM__TYPES_H_ +#define _NCCL_DEVICE_COMM__TYPES_H_ +#include "../comm.h" +#include "core__types.h" +#include "mem_barrier__types.h" +#include "ll_a2a__types.h" + +struct ncclDevCommWindowTable; +#if __cplusplus +struct ncclDevCommWindowTable { + struct Entry { + uintptr_t base, size; + ncclWindow_t window; + } entries[32]; + struct ncclDevCommWindowTable* next; +}; +#endif + +struct ncclDevComm { + int rank, nRanks; + uint32_t nRanks_rcp32; + int lsaRank, lsaSize; + uint32_t lsaSize_rcp32; + + struct ncclDevCommWindowTable* windowTable; + + ncclWindow_t resourceWindow; + struct ncclWindow_vidmem resourceWindow_inlined; + + ncclMultimemHandle_t lsaMultimem; + ncclLsaBarrierHandle_t lsaBarrier; +}; + +#endif // _NCCL_DEVICE_COMM__TYPES_H_ diff --git a/src/include/nccl_device/impl/core__funcs.h b/src/include/nccl_device/impl/core__funcs.h new file mode 100644 index 000000000..1087cd289 --- /dev/null +++ b/src/include/nccl_device/impl/core__funcs.h @@ -0,0 +1,210 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE__FUNCS_H_ +#define _NCCL_DEVICE_CORE__FUNCS_H_ +#include "core__types.h" +#include "comm__types.h" +#include "ptr__types.h" + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const &comm) { + ncclTeam ans; + ans.nRanks = comm.nRanks; + ans.rank = comm.rank; + ans.stride = 1; + return ans; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const &comm) { + ncclTeam ans; + ans.nRanks = comm.lsaSize; + ans.rank = comm.lsaRank; + ans.stride = 1; + return ans; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const& comm) { + ncclTeam ans; + ans.nRanks = nccl::utility::idivFast32(comm.nRanks, comm.lsaSize, comm.lsaSize_rcp32); + ans.rank = nccl::utility::idivFast32(comm.rank, comm.lsaSize, comm.lsaSize_rcp32); + ans.stride = comm.lsaSize; + return ans; +} +#endif + +NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int brank) { + int wrank = (brank - b.rank)*b.stride; + uint32_t adelta = wrank/a.stride; + uint32_t amod = wrank%a.stride; + int arank = a.rank + adelta; + return 0 <= arank && arank < a.nRanks && amod == 0; +} + +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int brank) { + int wrank = (brank - b.rank)*b.stride; + uint32_t adelta = wrank/a.stride; + //uint32_t amod = wrank%a.stride; + int arank = a.rank + adelta; + return arank; +} + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const& comm, ncclTeam tm, int rank) { + return comm.rank + (rank - tm.rank)*tm.stride; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const& comm, ncclTeam tm, int rank) { + return comm.lsaRank + (rank - tm.rank)*tm.stride; +} +#endif + +NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize) { + ncclTeam_t ans; + ans.nRanks = innerSize; + ans.rank = parent.rank%innerSize; + ans.stride = parent.stride; + return ans; +} + +NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize) { + ncclTeam_t ans; + ans.nRanks = 
parent.nRanks/innerSize; + ans.rank = parent.rank/innerSize; + ans.stride = parent.stride*innerSize; + return ans; +} + +NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index) { + int stride = subset.stride/parent.stride; + int below = parent.rank - subset.rank*stride; + if (stride < 0) { + stride = -stride; + below -= (subset.nRanks-1)*stride; + } + if (index < below) { + return index; + } else if (index-below < (subset.nRanks-1)*(stride-1)) { + return below + 1 + ((index-below)/(stride-1))*stride + (index-below)%(stride-1); + } else { + return below + 1 + (subset.nRanks-1)*stride + (index - below - (subset.nRanks-1)*(stride-1)); + } +} + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int i = nccl::utility::loadConst(&w->lsaRank); + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int i = peer; + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int worldRank = nccl::utility::loadConst(&w->worldRank); + int lsaRank = nccl::utility::loadConst(&w->lsaRank); + int i = lsaRank + (peer - worldRank); + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int lsaRank = nccl::utility::loadConst(&w->lsaRank); + int i = lsaRank + (peer - tm.rank)*tm.stride; + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mm) { + void* ptr = mm.mcBasePtr; + ptr = reinterpret_cast(ptr) + nccl::utility::loadConst(&w->mcOffset4K); + return (void*)((char*)ptr + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const& comm) { + return ncclGetMultimemPointer(w, offset, comm.lsaMultimem); +} +#endif + +NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t h) { + return ((size_t)h)*128; +} + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const& comm, ncclDevResourceHandle h) { + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, comm.lsaRank*stride4G); + return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const& comm, ncclDevResourceHandle h, int peer) { + int r = peer; + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G); 
+ return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclTeam team, int peer) { + int r = comm.lsaRank + (peer - team.rank)*team.stride; + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G); + return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclMultimemHandle mm) { + void* ptr = mm.mcBasePtr; + ptr = reinterpret_cast(ptr) + comm.resourceWindow_inlined.mcOffset4K; + ptr = reinterpret_cast(ptr) + h; + return ptr; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h) { + return ncclGetResourceBufferMultimemPointer(comm, h, comm.lsaMultimem); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE ncclSymPtr ncclGetResourceBuffer(ncclDevComm const& comm, ncclDevResourceHandle h) { + return ncclSymPtr(comm.resourceWindow, size_t(h)*128); +} +#endif + +#endif diff --git a/src/include/nccl_device/impl/core__types.h b/src/include/nccl_device/impl/core__types.h new file mode 100644 index 000000000..d2d1350b1 --- /dev/null +++ b/src/include/nccl_device/impl/core__types.h @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE__TYPES_H_ +#define _NCCL_DEVICE_CORE__TYPES_H_ +#include "../core.h" + +// nccl.h has: typedef ncclWindow_vidmem* ncclWindow_t; +struct ncclWindow_vidmem { + void* winHost; + //ncclGinWindow_t ginWin; + char* lsaFlatBase; // pointer to first byte for rank 0 of lsa team + int lsaRank; + int worldRank; + uint32_t stride4G; + uint32_t mcOffset4K; +}; + +struct ncclMultimemHandle { + void* mcBasePtr; +}; + +#endif diff --git a/src/include/nccl_device/impl/ll_a2a__funcs.h b/src/include/nccl_device/impl/ll_a2a__funcs.h new file mode 100644 index 000000000..39bdf7a29 --- /dev/null +++ b/src/include/nccl_device/impl/ll_a2a__funcs.h @@ -0,0 +1,229 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
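In the implementations above, every lsa rank's slice of a window is mapped into one flat VA range (lsaFlatBase) with a fixed per-rank stride (stride4G, counted in 4 GiB units and applied via add4G), so a peer pointer is plain arithmetic: base + peer*stride + offset. The host-only sketch below imitates that layout with an ordinary heap buffer and a byte stride; ToyWindow is an invented name and nothing here touches the CUDA VMM mapping the real runtime sets up:

```
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Toy "flat" symmetric mapping: every rank's window slice lives at
// flatBase + rank*strideBytes, so peer addresses are pure arithmetic.
struct ToyWindow {
  char* flatBase;      // start of rank 0's slice
  size_t strideBytes;  // distance between consecutive ranks' slices
  int lsaRank;         // my position in the flat mapping

  void* localPtr(size_t offset) const { return flatBase + lsaRank * strideBytes + offset; }
  void* peerPtr(int peer, size_t offset) const { return flatBase + peer * strideBytes + offset; }
};

int main() {
  const int lsaSize = 4;
  const size_t strideBytes = 1 << 20;           // 1 MiB per rank in this toy
  char* flat = (char*)calloc(lsaSize, strideBytes);

  ToyWindow win = { flat, strideBytes, /*lsaRank=*/2 };
  strcpy((char*)win.localPtr(0), "hello from rank 2");

  // Any rank holding the same flat mapping reads rank 2's slice directly.
  ToyWindow peerView = { flat, strideBytes, /*lsaRank=*/0 };
  printf("peer 2 says: %s\n", (char*)peerView.peerPtr(2, 0));
  free(flat);
}
```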
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A__FUNCS_H_ +#define _NCCL_DEVICE_LL_A2A__FUNCS_H_ +#include "ll_a2a__types.h" +#include "comm__types.h" +#include "../utility.h" + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLLA2ASession::ncclLLA2ASession( + Coop coop, ncclDevComm const& comm, ncclTeam team, + ncclLLA2AHandle handle, uint32_t block, int maxElts, + bool multimem, ncclMultimemHandle mmHandle + ): + ncclLLA2ASession_internal{ + coop, comm, team, handle, (int)block, /*pitch=*/maxElts, + multimem, mmHandle, /*epoch=*/0, /*slotsOffset=*/0 + } { + uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + line += block*(1 + 2*handle.nSlots); + this->epoch = line->x + 2; + this->slotsOffset = this->calcSlotOffset(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLLA2ASession::~ncclLLA2ASession() { + uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + line += this->block*(1 + 2*this->handle.nSlots); + if (this->coop.thread_rank() == 0) line->x = this->epoch - 2; + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::send(int peer, int elt, T data) { + using nccl::utility::divUp; + union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + tmp = data; + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, peer); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::bcast(int elt, T data) { + using nccl::utility::divUp; + if (this->multimem) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = data; + uint4* bufmc = (uint4*)ncclGetResourceBufferMultimemPointer(this->comm, this->handle.bufHandle, this->mmHandle); + bufmc += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(bufmc + this->pitch*u), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + } else { + union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + tmp = data; + int dr = 0; + int r = this->team.rank; + #pragma unroll 1 + for (; dr+8 <= this->team.nRanks; dr += 8) { + #pragma unroll + for (int ur=0; ur < 8; ur++) { + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + r += 1; + if (r == this->team.nRanks) r = 0; + } + } + #pragma unroll + for (int ur=0; ur < 8; ur++, dr++) { + if (dr == this->team.nRanks) break; + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + r += 1; + if (r == this->team.nRanks) r = 0; + } + } +} +#endif + 
+#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE T ncclLLA2ASession::recv(int elt) { + T ret[1]; + this->template recvUnrolled(elt, 1, 0, ret); + return ret[0]; +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::recvUnrolled(int eltStart, int eltCount, int eltStride, T(&elts)[MaxEltCount]) { + using nccl::utility::divUp; + uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + buf += this->slotsOffset + eltStart; + + uint4 tmp[MaxEltCount][divUp(sizeof(T), 8)]; + #pragma unroll 1 + while (true) { + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + if (u < MinEltCount || u < eltCount) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" + : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) + : "l"(buf + u*eltStride + v*this->pitch)); + } + } + } + bool okAll = true; + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + if (u < MinEltCount || u < eltCount) { + bool ok = tmp[u][v].y == this->epoch && + tmp[u][v].w == this->epoch; + okAll &= ok; + } + } + } + if (__builtin_expect(okAll, true)) break; + } + + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + if (MinEltCount <= u && u == eltCount) break; + union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + u32[v][0] = tmp[u][v].x; + u32[v][1] = tmp[u][v].z; + } + elts[u] = val; + } +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE auto ncclLLA2ASession::recvReduce( + int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce reduce + ) -> decltype(eltToAcc(nccl::utility::declval())) { + using Acc = decltype(eltToAcc(nccl::utility::declval())); + Acc acc; + int i = 0; + #pragma unroll 1 + for (; i+Unroll <= eltCount; i += Unroll) { + Elt got[Unroll]; + this->template recvUnrolled(eltStart + i*eltStride, Unroll, eltStride, got); + Acc acc0 = eltToAcc(got[0]); + acc = i==0 ? acc0 : reduce(acc, acc0); + #pragma unroll + for (int j=1; j < Unroll; j++) acc = reduce(acc, eltToAcc(got[j])); + } + if (i < eltCount) { + Elt got[Unroll]; + this->template recvUnrolled(eltStart + i*eltStride, eltCount-i, eltStride, got); + Acc acc0 = eltToAcc(got[0]); + acc = i==0 ? acc0 : reduce(acc, acc0); + #pragma unroll + for (int j=1; j < Unroll-1; j++) { + if (i+j < eltCount) acc = reduce(acc, eltToAcc(got[j])); + } + } + return acc; +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::endEpoch(Coop) { + if (__builtin_expect(this->epoch >= -2u, false)) { + this->coop.sync(); + uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + buf += this->slotsOffset; + #pragma unroll 4 + for (int i=this->coop.thread_rank(); i < this->handle.nSlots; i += this->coop.size()) { + buf[i] = uint4{0, 0, 0, 0}; + } + } + this->coop.sync(); + this->epoch += (this->epoch == -1u) ? 3 : 1; + this->slotsOffset = this->calcSlotOffset(); +} +#endif + +#endif // _NCCL_DEVICE_LL_A2A__FUNCS_H_ diff --git a/src/include/nccl_device/impl/ll_a2a__types.h b/src/include/nccl_device/impl/ll_a2a__types.h new file mode 100644 index 000000000..501777acf --- /dev/null +++ b/src/include/nccl_device/impl/ll_a2a__types.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A__TYPES_H_ +#define _NCCL_DEVICE_LL_A2A__TYPES_H_ +#include "../ll_a2a.h" +#include "core__types.h" + +struct ncclLLA2AHandle { + ncclDevResourceHandle_t bufHandle; + uint32_t nSlots; +}; + +#if __CUDACC__ +template +struct ncclLLA2ASession_internal { + Coop coop; + ncclDevComm const& comm; + ncclTeam team; + ncclLLA2AHandle handle; + int block; + int pitch; + bool multimem; + ncclMultimemHandle mmHandle; + uint32_t epoch; + uint32_t slotsOffset; + + NCCL_DEVICE_INLINE uint32_t calcSlotOffset() const { + return block*(1 + 2*handle.nSlots) + 1 + (epoch & 1)*handle.nSlots; + } +}; +#endif + +#endif // _NCCL_DEVICE_LL_A2A__TYPES_H_ diff --git a/src/include/nccl_device/impl/mem_barrier__funcs.h b/src/include/nccl_device/impl/mem_barrier__funcs.h new file mode 100644 index 000000000..86a5d0fbc --- /dev/null +++ b/src/include/nccl_device/impl/mem_barrier__funcs.h @@ -0,0 +1,126 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ +#define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ +#include "mem_barrier__types.h" +#include "comm__types.h" + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::ncclLsaBarrierSession( + Coop coop, ncclDevComm const& comm, ncclTeam team, + ncclLsaBarrierHandle handle, uint32_t index, + bool multimem, ncclMultimemHandle mmHandle + ): + ncclLsaBarrierSession_internal{ + coop, comm, team, handle, (int)index, +#if CUDART_VERSION >= 12060 + multimem, +#else // WAR for an issue with ptxas in CTK < 12.6 + /*multimem=*/false, +#endif + mmHandle, /*epoch=*/0 + } { + uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + this->epoch = state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index]; +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::ncclLsaBarrierSession( + Coop coop, ncclDevComm const& comm, ncclTeamTagLsa, uint32_t index, bool multimem + ): ncclLsaBarrierSession( + coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem + ) { +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::~ncclLsaBarrierSession() { + uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + if (this->coop.thread_rank() == 0) { +#if __CUDA_ARCH__ == 1200 && CUDART_VERSION < 13000 + // WAR for a compiler issue with CTK < 13.0 + if (this->index == 0) + state[(this->multimem ? 0 : 1)*this->handle.nBarriers] = this->epoch; + else +#endif + state[(this->multimem ? 
0 : 1)*this->handle.nBarriers + this->index] = this->epoch; + } + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::arrive(Coop, cuda::memory_order order) { + this->coop.sync(); + if (this->multimem) { + #if __CUDA_ARCH__ >= 900 + if (this->coop.thread_rank() == 0) { + uint32_t* inbox = this->mcInbox(/*multimem=*/true); + if (nccl::utility::releaseOrderOf(order) != cuda::memory_order_relaxed) { + asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); + } else { + asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); + } + } + #endif + } else { + #pragma unroll 1 + for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) { + int peer = i + (this->team.rank <= i ? 1 : 0); + cuda::atomic_ref inbox(*this->ucInbox(peer, this->team.rank)); + inbox.store(this->epoch+1, nccl::utility::releaseOrderOf(order)); + } + } +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::wait(Coop, cuda::memory_order order) { + if (this->multimem) { + #if __CUDA_ARCH__ >= 900 + if (this->coop.thread_rank() == 0) { + cuda::atomic_ref inbox(*this->mcInbox(/*multimem=*/false)); + #pragma unroll 1 + while (true) { + uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order)); + if (got - (this->epoch + this->team.nRanks) <= uint32_t(-1)>>1) break; + } + this->epoch += this->team.nRanks; + } + #endif + } else { + #pragma unroll 1 + for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) { + int peer = i + (this->team.rank <= i ? 1 : 0); + cuda::atomic_ref inbox(*this->ucInbox(this->team.rank, peer)); + #pragma unroll 1 + while (true) { + uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order)); + if (got - (this->epoch + 1) <= uint32_t(-1)>>1) break; + } + } + this->epoch += 1; + } + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::sync(Coop coop, cuda::memory_order order) { + this->arrive(coop, order); + this->wait(coop, order); +} +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ diff --git a/src/include/nccl_device/impl/mem_barrier__types.h b/src/include/nccl_device/impl/mem_barrier__types.h new file mode 100644 index 000000000..8498cd6ba --- /dev/null +++ b/src/include/nccl_device/impl/mem_barrier__types.h @@ -0,0 +1,46 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ +#define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ +#include "../mem_barrier.h" +#include "core__types.h" + +struct ncclLsaBarrierHandle { + ncclDevResourceHandle_t bufHandle; + int nBarriers; +}; + +#if __CUDACC__ +template +struct ncclLsaBarrierSession_internal { + Coop coop; + ncclDevComm const& comm; + ncclTeam team; + ncclLsaBarrierHandle handle; + int index; + bool multimem; + ncclMultimemHandle mmHandle; + uint32_t epoch; + + NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) { + uint32_t* state; + if (multimem) { // multicast + state = (uint32_t*)ncclGetResourceBufferMultimemPointer(comm, handle.bufHandle, mmHandle); + } else { // unicast + state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + } + return state + 2*handle.nBarriers + index; + } + + NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) { + uint32_t* state = (uint32_t*)ncclGetResourceBufferPeerPointer(comm, handle.bufHandle, team, owner); + return state + 3*handle.nBarriers + index*team.nRanks + peer; + } +}; +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ diff --git a/src/include/nccl_device/impl/ptr__funcs.h b/src/include/nccl_device/impl/ptr__funcs.h new file mode 100644 index 000000000..ef33634e4 --- /dev/null +++ b/src/include/nccl_device/impl/ptr__funcs.h @@ -0,0 +1,157 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR__FUNCS_H_ +#define _NCCL_DEVICE_PTR__FUNCS_H_ +#include "ptr__types.h" +#include "core__funcs.h" +#include "comm__types.h" + +#if __cplusplus + +template +NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr::ncclSymPtr(ncclWindow_t window, size_t offset): + window(window), offset(offset) { +} + +template +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr::operator ncclSymPtr() const { + return {window, offset}; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(int d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned int d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(int d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned int d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& 
ncclSymPtr::operator-=(long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::localPtr() const { + return (T*)ncclGetLocalPointer(window, offset); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::lsaPtr(int peer) const { + return (T*)ncclGetLsaPointer(window, offset, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::peerPtr(int peer) const { + return (T*)ncclGetPeerPointer(window, offset, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::peerPtr(ncclTeam team, int peer) const { + return (T*)ncclGetPeerPointer(window, offset, team, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::multimemPtr(ncclMultimemHandle mmHandle) const { + return (T*)ncclGetMultimemPointer(window, offset, mmHandle); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::lsaMultimemPtr(ncclDevComm const& comm) const { + return (T*)ncclGetLsaMultimemPointer(window, offset, comm); +} +#endif + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator+(ncclSymPtr p, Int d) { + return p += d; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator-(ncclSymPtr p, Int d) { + return p -= d; +} +template +NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr a, ncclSymPtr b) { + return reinterpret_cast(a.offset) - reinterpret_cast(b.offset); +} + +template +NCCL_HOST_DEVICE_INLINE bool operator==(ncclSymPtr a, ncclSymPtr b) { + return a.window == b.window && a.offset == b.offset; +} +template +NCCL_HOST_DEVICE_INLINE bool operator!=(ncclSymPtr a, ncclSymPtr b) { + return a.window != b.window || a.offset != b.offset; +} + +#endif // __cplusplus +#endif // _NCCL_DEVICE_PTR__FUNCS_H_ diff --git a/src/include/nccl_device/impl/ptr__types.h b/src/include/nccl_device/impl/ptr__types.h new file mode 100644 index 000000000..3f9a1a0f8 --- /dev/null +++ b/src/include/nccl_device/impl/ptr__types.h @@ -0,0 +1,11 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR__TYPES_H_ +#define _NCCL_DEVICE_PTR__TYPES_H_ +#include "../ptr.h" +#include "core__types.h" +#endif // _NCCL_DEVICE_PTR__TYPES_H_ diff --git a/src/include/nccl_device/ll_a2a.h b/src/include/nccl_device/ll_a2a.h new file mode 100644 index 000000000..db3a517b7 --- /dev/null +++ b/src/include/nccl_device/ll_a2a.h @@ -0,0 +1,53 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A_H_ +#define _NCCL_DEVICE_LL_A2A_H_ +#include "impl/core__types.h" + +struct ncclLLA2AHandle; + +NCCL_EXTERN_C __host__ int ncclLLA2ACalcSlots(int maxElts, int maxEltSize); + +NCCL_EXTERN_C __host__ ncclResult_t ncclLLA2ACreateRequirement(int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); + +#if __CUDACC__ +template +struct ncclLLA2ASession_internal; + +template +struct ncclLLA2ASession: ncclLLA2ASession_internal { + NCCL_DEVICE_INLINE ncclLLA2ASession(Coop, ncclDevComm const&, ncclTeam, ncclLLA2AHandle, uint32_t block, int maxElts, bool multimem=false, ncclMultimemHandle mmHandle={}); + + NCCL_DEVICE_INLINE ~ncclLLA2ASession(); + + ncclLLA2ASession(ncclLLA2ASession const&) = delete; // Sessions are not copyable + + template + NCCL_DEVICE_INLINE void send(int peer, int slot, T data); + + template + NCCL_DEVICE_INLINE void bcast(int slot, T data); + + template + NCCL_DEVICE_INLINE T recv(int slot); + + template + NCCL_DEVICE_INLINE void recvUnrolled(int eltStart, int eltCount, int eltStride, T(&vals)[MaxEltCount]); + + template + NCCL_DEVICE_INLINE auto recvReduce(int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce red) + -> decltype(eltToAcc(nccl::utility::declval())) ; + + // End an alltoall region. For every peer in team you must have done both of the + // following each of which can be accomplished using any thread in coop: + // 1. Targeted that peer with at least one send(). + // 2. Received from a slot targeted by that peer. + NCCL_DEVICE_INLINE void endEpoch(Coop); +}; +#endif + +#endif // _NCCL_DEVICE_LL_A2A_H_ diff --git a/src/include/nccl_device/mem_barrier.h b/src/include/nccl_device/mem_barrier.h new file mode 100644 index 000000000..ea90cc6f6 --- /dev/null +++ b/src/include/nccl_device/mem_barrier.h @@ -0,0 +1,35 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER_H_ +#define _NCCL_DEVICE_MEM_BARRIER_H_ +#include "impl/core__types.h" + +struct ncclLsaBarrierHandle; + +NCCL_EXTERN_C __host__ ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); + +#if __CUDACC__ +template +struct ncclLsaBarrierSession_internal; + +template +struct ncclLsaBarrierSession: ncclLsaBarrierSession_internal { + NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeam, ncclLsaBarrierHandle, uint32_t index, bool multimem=false, ncclMultimemHandle mmHandle={}); + + NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeamTagLsa, uint32_t index, bool multimem=false); + + NCCL_DEVICE_INLINE ~ncclLsaBarrierSession(); + + ncclLsaBarrierSession(ncclLsaBarrierSession const&) = delete; // Sessions are not copyable + + NCCL_DEVICE_INLINE void arrive(Coop, cuda::memory_order); + NCCL_DEVICE_INLINE void wait(Coop, cuda::memory_order); + NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order); +}; +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER_H_ diff --git a/src/include/nccl_device/ptr.h b/src/include/nccl_device/ptr.h new file mode 100644 index 000000000..4b8914c88 --- /dev/null +++ b/src/include/nccl_device/ptr.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR_H_ +#define _NCCL_DEVICE_PTR_H_ +#include "core.h" +#include + +#if __cplusplus +template +struct ncclSymPtr { + using ElementType = T; + ncclWindow_t window; + size_t offset; + + NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr(ncclWindow_t window=nullptr, size_t offset=0); + + template + NCCL_HOST_DEVICE_INLINE operator ncclSymPtr() const; + + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(long long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned long long d); + + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(long long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned long long d); + + #if __CUDACC__ + NCCL_DEVICE_INLINE T* localPtr() const; + NCCL_DEVICE_INLINE T* lsaPtr(int peer) const; + NCCL_DEVICE_INLINE T* peerPtr(int peer) const; + NCCL_DEVICE_INLINE T* peerPtr(ncclTeam team, int peer) const; + NCCL_DEVICE_INLINE T* multimemPtr(ncclMultimemHandle mmHandle) const; + NCCL_DEVICE_INLINE T* lsaMultimemPtr(ncclDevComm const&) const; + #endif +}; + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator+(ncclSymPtr p, Int d); +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator-(ncclSymPtr p, Int d); +template +NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr a, ncclSymPtr b); + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator==(ncclSymPtr a, ncclSymPtr b); 
+template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator!=(ncclSymPtr a, ncclSymPtr b); +#endif + +#endif diff --git a/src/include/nccl_device/utility.h b/src/include/nccl_device/utility.h new file mode 100644 index 000000000..b98a0d973 --- /dev/null +++ b/src/include/nccl_device/utility.h @@ -0,0 +1,352 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_UTILITY_H_ +#define _NCCL_DEVICE_UTILITY_H_ + +#if __CUDACC__ + #define NCCL_DEVICE_INLINE __device__ __forceinline__ + #define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else + #ifndef __host__ + #define __host__ + #endif + #define NCCL_DEVICE_INLINE + #define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline)) +#endif + +#if __cplusplus +#define NCCL_EXTERN_C extern "C" +#else +#define NCCL_EXTERN_C +#endif + +#include +#include + +#if __CUDACC__ +#include +#endif + +#if __cplusplus +namespace nccl { +namespace utility { + +template +T&& declval() noexcept { + static_assert(sizeof(T)!=sizeof(T), "You can't evaluate declval."); +} + +template +NCCL_HOST_DEVICE_INLINE constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +NCCL_HOST_DEVICE_INLINE constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} +template +NCCL_HOST_DEVICE_INLINE constexpr Z roundDown(X x, Y y) { + return x - x%y; +} + +// assumes second argument is a power of 2 +template +NCCL_HOST_DEVICE_INLINE constexpr Z alignUp(X x, Y a) { + return (x + a-1) & -Z(a); +} +template +NCCL_HOST_DEVICE_INLINE T* alignUp(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} +template +NCCL_HOST_DEVICE_INLINE void* alignUp(void const* x, size_t a) { + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} + +// assumes second argument is a power of 2 +template +NCCL_HOST_DEVICE_INLINE constexpr Z alignDown(X x, Y a) { + return x & -Z(a); +} +template +NCCL_HOST_DEVICE_INLINE T* alignDown(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); +} +template +NCCL_HOST_DEVICE_INLINE void* alignDown(void const* x, size_t a) { + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); +} + +template +NCCL_HOST_DEVICE_INLINE T add4G(T base, int delta4G) { + union { uint32_t u32[2]; T tmp; }; + tmp = base; + u32[1] += delta4G; + return tmp; +} + + +template +NCCL_HOST_DEVICE_INLINE constexpr bool isPow2(Int x) { + return (x & (x-1)) == 0; +} + +// Produce the reciprocal of x for use in idivByRcp +NCCL_HOST_DEVICE_INLINE constexpr uint32_t idivRcp32(uint32_t x) { + return uint32_t(-1)/x + isPow2(x); +} +NCCL_HOST_DEVICE_INLINE constexpr uint64_t idivRcp64(uint64_t x) { + return uint64_t(-1)/x + isPow2(x); +} + +NCCL_HOST_DEVICE_INLINE uint32_t mul32hi(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ + return __umulhi(a, b); +#else + return uint64_t(a)*b >> 32; +#endif +} +NCCL_HOST_DEVICE_INLINE uint64_t mul64hi(uint64_t a, uint64_t b) { +#if __CUDA_ARCH__ + return __umul64hi(a, b); +#else + return (uint64_t)(((unsigned __int128)a)*b >> 64); +#endif +} + +// Produce the reciprocal of x*y given their respective reciprocals. This incurs +// no integer division on device. 
+NCCL_HOST_DEVICE_INLINE uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
+  if (xrcp == 0) return yrcp;
+  if (yrcp == 0) return xrcp;
+  uint32_t rcp = mul32hi(xrcp, yrcp);
+  uint32_t rem = -x*y*rcp;
+  if (x*y <= rem) rcp += 1;
+  return rcp;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
+  if (xrcp == 0) return yrcp;
+  if (yrcp == 0) return xrcp;
+  uint64_t rcp = mul64hi(xrcp, yrcp);
+  uint64_t rem = -x*y*rcp;
+  if (x*y <= rem) rcp += 1;
+  return rcp;
+}
+
+// Fast unsigned integer division where the divisor has a precomputed reciprocal.
+// idivFast(x, y, idivRcp(y)) == x/y
+NCCL_HOST_DEVICE_INLINE void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q = yrcp == 0 ? x : mul32hi(x, yrcp);
+  uint32_t r = x - y*q;
+  if (r >= y) { q += 1; r -= y; }
+  *quo = q;
+  *rem = r;
+}
+NCCL_HOST_DEVICE_INLINE void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q = yrcp == 0 ? x : mul64hi(x, yrcp);
+  uint64_t r = x - y*q;
+  if (r >= y) { q += 1; r -= y; }
+  *quo = q;
+  *rem = r;
+}
+
+NCCL_HOST_DEVICE_INLINE uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q, r;
+  idivmodFast32(&q, &r, x, y, yrcp);
+  return q;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q, r;
+  idivmodFast64(&q, &r, x, y, yrcp);
+  return q;
+}
+
+NCCL_HOST_DEVICE_INLINE uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q, r;
+  idivmodFast32(&q, &r, x, y, yrcp);
+  return r;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q, r;
+  idivmodFast64(&q, &r, x, y, yrcp);
+  return r;
+}
+
+#if __CUDACC__
+// Precomputed integer reciprocals for denominator values 1..64 inclusive.
+// Pass these to idivFast64() for fast division on the GPU.
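// Hedged usage sketch (editorial illustration, not part of this header): with the
// reciprocal computed once, a divide-and-modulo on the GPU becomes two multiplies
// plus a correction step. For example, splitting a flat index across nChannels:
//
//   uint64_t rcp = idivRcp64(nChannels);          // once, on host or device
//   uint64_t chunk, chan;
//   idivmodFast64(&chunk, &chan, index, nChannels, rcp);
//   // chunk == index / nChannels, chan == index % nChannels
//
// The table below serves the same purpose for small divisors (<= 64) without
// evaluating the 64-bit division in idivRcp64() at run time.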
+NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) { + static constexpr uint64_t table[65] = { + idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), + idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), + idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), + idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), + idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), + idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), + idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), + idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), + idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), + idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), + idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), + idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), + idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), + idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), + idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), + idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), + idivRcp64(0x40) + }; + return table[x]; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE uint32_t idivRcp32_upto64(int x) { + return idivRcp64_upto64(x)>>32; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void fenceAcquireGpu() { + static __device__ int dummy; + int tmp; + asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory"); + dummy = tmp; +} +NCCL_DEVICE_INLINE void fenceReleaseGpu() { + cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE cuda::memory_order acquireOrderOf(cuda::memory_order ord) { + return ord == cuda::memory_order_release ? cuda::memory_order_relaxed : + ord == cuda::memory_order_acq_rel ? cuda::memory_order_acquire : + ord; +} +NCCL_DEVICE_INLINE cuda::memory_order releaseOrderOf(cuda::memory_order ord) { + return ord == cuda::memory_order_acquire ? cuda::memory_order_relaxed : + ord == cuda::memory_order_acq_rel ? cuda::memory_order_release : + ord; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE int lane() { + int ret; + asm("mov.u32 %0, %%laneid;" : "=r"(ret)); + return ret; +} +NCCL_DEVICE_INLINE unsigned int lanemask_lt() { + unsigned int ret; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret)); + return ret; +} +#endif + +#if __CUDACC__ +// Load anything, but cache like its constant memory. 
+template +NCCL_DEVICE_INLINE T loadConst(T const *p) { + if (alignof(T) == 1) { + union { uint8_t part[sizeof(T)]; T ret; }; + for (int i=0; i < (int)sizeof(T); i++) part[i] = __ldg((uint8_t const*)p + i); + return ret; + } else if (alignof(T) == 2) { + union { uint16_t part[sizeof(T)/2]; T ret; }; + for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = __ldg((uint16_t const*)p + i); + return ret; + } else if (alignof(T) == 4) { + union { uint32_t part[sizeof(T)/4]; T ret; }; + for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = __ldg((uint32_t const*)p + i); + return ret; + } else if (alignof(T) == 8) { + union { uint64_t part[sizeof(T)/8]; T ret; }; + for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = __ldg((uint64_t const*)p + i); + return ret; + } else { // alignof(T) >= 16 + union { ulonglong2 part[sizeof(T)/16]; T ret; }; + for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = __ldg((ulonglong2 const*)p + i); + return ret; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Optional: Holds a T that may or may not be constructed. An Optional +// constructed with a Present will have its T constructed via the +// T::T(Arg...) constructor. An Optional constructed with a Absent will not +// have its T constructed. + +template +struct IntSeq {}; + +template +struct IntSeqUpTo: IntSeqUpTo {}; +template +struct IntSeqUpTo { using Type = IntSeq; }; + +// Present: Packs a list of arguments together to be passed to Optional. +template +struct Present; +template<> +struct Present<> {}; +template +struct Present { + H h; + Present t; + + NCCL_HOST_DEVICE_INLINE H get(IntSeq<0>) { + return static_cast(h); + } + template + NCCL_HOST_DEVICE_INLINE decltype(auto) get(IntSeq) { + return t.get(IntSeq{}); + } +}; + +NCCL_HOST_DEVICE_INLINE Present<> present() { + return Present<>{}; +} +template +NCCL_HOST_DEVICE_INLINE Present present(H&& h, T&& ...t) { + return Present{static_cast(h), present(static_cast(t)...)}; +} + +struct Absent {}; + +template +struct Optional { + bool present; // Is `thing` constructed. + union { T thing; }; + + // Construct with absent thing: + NCCL_HOST_DEVICE_INLINE constexpr Optional(): present(false) {} + NCCL_HOST_DEVICE_INLINE constexpr Optional(Absent): present(false) {} + + // Helper constructor + template + NCCL_HOST_DEVICE_INLINE Optional(Present args, IntSeq): + present(true), + thing{args.get(IntSeq())...} { + } + // Construct with present thing: + template + NCCL_HOST_DEVICE_INLINE Optional(Present args): + Optional(args, IntSeqUpTo::Type()) { + } + + NCCL_HOST_DEVICE_INLINE ~Optional() { + if (present) thing.~T(); + } +}; + +}} +#endif // __cplusplus +#endif diff --git a/src/include/net.h b/src/include/net.h index 552e9bcb4..f13eebb06 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -12,10 +12,16 @@ #include "comm.h" #include "checks.h" +#define NCCL_UNDEF_DEV_COUNT -1 + typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); +ncclResult_t ncclNetGetDevCount(int netPluginIndex, int* nPhysDev, int* nVirtDev); +ncclResult_t ncclNetSetVirtDevCount(int netPluginIndex, int nVirtDev); +ncclResult_t ncclCollNetGetDevCount(int netPluginIndex, int* nPhysDev, int* nVirtDev); +ncclResult_t ncclCollNetSetVirtDevCount(int netPluginIndex, int nVirtDev); // Test whether the current GPU support GPU Direct RDMA. 
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index c3a79e35c..99ae9c38b 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -12,7 +12,7 @@ // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. -#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 +#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 72fbf9ce2..ce8925ef9 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -253,6 +253,24 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) +/** + * Structure to store platform information (v2) + */ +typedef struct +{ + unsigned int version; //!< the API version number + unsigned char ibGuid[16]; //!< Infiniband GUID reported by platform (for Blackwell, ibGuid is 8 bytes so indices 8-15 are zero) + unsigned char chassisSerialNumber[16]; //!< Serial number of the chassis containing this GPU (for Blackwell it is 13 bytes so indices 13-15 are zero) + unsigned char slotNumber; //!< The slot number in the chassis containing this GPU (includes switches) + unsigned char trayIndex; //!< The tray index within the compute slots in the chassis containing this GPU (does not include switches) + unsigned char hostId; //!< Index of the node within the slot containing this GPU + unsigned char peerType; //!< Platform indicated NVLink-peer type (e.g. switch present or not) + unsigned char moduleId; //!< ID of this GPU within the node +} nvmlPlatformInfo_v2_t; + +typedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t; +#define nvmlPlatformInfo_v2 NVML_STRUCT_VERSION(PlatformInfo, 2) + /** * Confidential Compute Feature Status values */ @@ -270,6 +288,7 @@ typedef struct nvmlConfComputeSystemState_st { */ #define NVML_CC_SYSTEM_MULTIGPU_NONE 0 #define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 +#define NVML_CC_SYSTEM_MULTIGPU_NVLE 2 /** * Confidential Compute System settings @@ -303,6 +322,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa struct ncclNvmlCCStatus { bool CCEnabled; bool multiGpuProtectedPCIE; + bool multiGpuNVLE; }; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. 
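The new nvmlPlatformInfo_v2_t fields describe where a GPU physically sits (chassis serial number, slot, tray, host index, module). A minimal sketch of how a caller might fold a few of these fields into a coarse locality key, e.g. for grouping ranks by chassis and tray, follows; the helper name and key format are illustrative assumptions, not an API defined by this patch.

#include <cstdio>
#include <string>

// Illustrative only: render the raw chassis serial bytes as hex (the field is a
// byte array, not a C string) and append slot/tray to form a grouping key.
static std::string platformLocalityKey(const nvmlPlatformInfo_t& info) {
  char key[96];
  int n = 0;
  for (int i = 0; i < 16 && n < (int)sizeof(key); i++)
    n += snprintf(key + n, sizeof(key) - n, "%02x", info.chassisSerialNumber[i]);
  snprintf(key + n, sizeof(key) - n, "/slot%u/tray%u",
           (unsigned)info.slotNumber, (unsigned)info.trayIndex);
  return std::string(key);
}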
@@ -320,6 +340,7 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo); +ncclResult_t ncclNvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *plaformInfo); ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status); #endif // End include guard diff --git a/src/include/nvtx.h b/src/include/nvtx.h index de50dfe2e..8f20be43d 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -9,6 +9,8 @@ #include "nvtx3/nvtx3.hpp" +#include "param.h" + #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else @@ -32,15 +34,20 @@ #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 #define NVTX_SID_CommShrink 15 +#define NVTX_SID_AlltoAll 16 +#define NVTX_SID_Gather 17 +#define NVTX_SID_Scatter 18 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 16 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +extern int64_t ncclParamNvtxDisable(); + /// @brief Register an NVTX payload schema for static-size payloads. class payload_schema { public: @@ -74,6 +81,32 @@ class payload_schema { nullptr, 0, 0, 0, 0, nullptr}; }; +class ncclOptionalNvtxScopedRange +{ + public: + void push(const nvtx3::event_attributes& attr) noexcept { + // pushed must not be true already, but it's too expensive to check + pushed = true; + nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); + } + + ~ncclOptionalNvtxScopedRange() noexcept { + if (!pushed) { + return; + } + nvtxDomainRangePop(nvtx3::domain::get()); + } + + ncclOptionalNvtxScopedRange() = default; + ncclOptionalNvtxScopedRange(ncclOptionalNvtxScopedRange const&) = delete; + ncclOptionalNvtxScopedRange& operator=(ncclOptionalNvtxScopedRange const&) = delete; + ncclOptionalNvtxScopedRange(ncclOptionalNvtxScopedRange&&) = delete; + ncclOptionalNvtxScopedRange& operator=(ncclOptionalNvtxScopedRange&&) = delete; + + private: + bool pushed = false; +}; + // Convenience macro to give the payload parameters a scope. #define NVTX3_PAYLOAD(...) __VA_ARGS__ @@ -81,26 +114,43 @@ class payload_schema { // @param N NCCL API name without the `nccl` prefix. // @param T name of the used NVTX payload schema without "Schema" suffix. 
// @param P payload parameters/entries -#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ - constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ - static const payload_schema schema{T##Schema, std::extent::value - 1, \ - schemaId, sizeof(T)}; \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - const T _payload = {P}; \ - nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ - ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + ncclOptionalNvtxScopedRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) \ + { \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema \ + schema{T##Schema, std::extent::value - 1, schemaId, sizeof(T)}; \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } + +#define NCCL_NVTX3_FUNC_RANGE \ + ncclOptionalNvtxScopedRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) { \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } /// @brief Creates an NVTX range with extended payload using the RAII pattern. /// @tparam PayloadType Data type of the payload. template -class ncclNvtxRange { +class ncclOptionalNvtxPayloadRange { public: - explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { - nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + void push(const nvtx3::event_attributes& attr) noexcept { + // pushed must not be true already, but it's too expensive to check + pushed = true; + nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); } - ~ncclNvtxRange() noexcept { + ~ncclOptionalNvtxPayloadRange() noexcept { + if (!pushed) { + return; + } if (payloadData.payload) { nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); } else { @@ -113,25 +163,34 @@ class ncclNvtxRange { payloadData = {schemaId, sizeof(PayloadType), &payload}; } - ncclNvtxRange() = delete; - ncclNvtxRange(ncclNvtxRange const&) = default; - ncclNvtxRange& operator=(ncclNvtxRange const&) = default; - ncclNvtxRange(ncclNvtxRange&&) = default; - ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + ncclOptionalNvtxPayloadRange() = default; + ncclOptionalNvtxPayloadRange(ncclOptionalNvtxPayloadRange const&) = delete; + ncclOptionalNvtxPayloadRange& operator=(ncclOptionalNvtxPayloadRange const&) = delete; + ncclOptionalNvtxPayloadRange(ncclOptionalNvtxPayloadRange&&) = delete; + ncclOptionalNvtxPayloadRange& operator=(ncclOptionalNvtxPayloadRange&&) = delete; // Holds the payload data. PayloadType payload{}; + bool isPushed() const noexcept { + return pushed; + } + private: + bool pushed = false; nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; }; // Create an NVTX range with the function name as the range name. Use RAII pattern. // @param T Type ID of the NVTX payload (pointer for variable-size payloads). 
-#define NVTX3_RANGE(T) \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; +#define NVTX3_RANGE(T) \ + ncclOptionalNvtxPayloadRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) \ + { \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } // Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, // which must be in this or an outer scope. @@ -139,6 +198,9 @@ class ncclNvtxRange { // @param S name of the used NVTX payload schema. // @param P payload parameters/entries #define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + if (!nvtx3_range__.isPushed()) { \ + break; \ + } \ constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ static const payload_schema schema{S, std::extent::value - 1, schema_id, \ sizeof(nvtx3_range__.payload)}; \ diff --git a/src/include/nvtx3/nvToolsExtCounters.h b/src/include/nvtx3/nvToolsExtCounters.h index 00e2b7f8f..e24ab0e04 100644 --- a/src/include/nvtx3/nvToolsExtCounters.h +++ b/src/include/nvtx3/nvToolsExtCounters.h @@ -332,4 +332,4 @@ NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx( } #endif /* __cplusplus */ -#endif /* NVTOOLSEXT_COUNTERS_H */ \ No newline at end of file +#endif /* NVTOOLSEXT_COUNTERS_H */ diff --git a/src/include/nvtx3/nvToolsExtSemanticsCounters.h b/src/include/nvtx3/nvToolsExtSemanticsCounters.h index f97624a07..6334bfc5d 100644 --- a/src/include/nvtx3/nvToolsExtSemanticsCounters.h +++ b/src/include/nvtx3/nvToolsExtSemanticsCounters.h @@ -85,4 +85,4 @@ typedef struct nvtxSemanticsCounter_v1 { } limits; } nvtxSemanticsCounter_t; -#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ \ No newline at end of file +#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ diff --git a/src/include/nvtx3/nvToolsExtSemanticsScope.h b/src/include/nvtx3/nvToolsExtSemanticsScope.h index eed6f3095..e6d1c5f26 100644 --- a/src/include/nvtx3/nvToolsExtSemanticsScope.h +++ b/src/include/nvtx3/nvToolsExtSemanticsScope.h @@ -27,4 +27,4 @@ typedef struct nvtxSemanticsScope_v1 uint64_t scopeId; } nvtxSemanticsScope_t; -#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ \ No newline at end of file +#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h index 00fc81768..6fca4801b 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h @@ -28,4 +28,4 @@ #define NVTX_EXT_HELPER_UNUSED_ARGS(...) 
\ NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) -#endif /* NVTX_EXT_HELPER_MACROS_H */ \ No newline at end of file +#endif /* NVTX_EXT_HELPER_MACROS_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h index 79bb0c1c5..56dcab692 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h @@ -96,4 +96,4 @@ NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtG } /* extern "C" */ #endif /* __cplusplus */ -#endif /* NVTX_EXT_IMPL_H */ \ No newline at end of file +#endif /* NVTX_EXT_IMPL_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h index 0f6ff9667..c34fa8392 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h @@ -145,4 +145,4 @@ NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx, } /* extern "C" */ #endif /* __cplusplus */ -#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ \ No newline at end of file +#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h index 71e30bc37..9d07f5b1f 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h @@ -269,4 +269,4 @@ /*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{INIT,CREATE}` */ -#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ \ No newline at end of file +#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h index 6a30e6633..eeb227a5a 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h @@ -148,4 +148,4 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_IN }; #undef nvtx_alignof -#undef nvtx_alignof2 \ No newline at end of file +#undef nvtx_alignof2 diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h index bcad095a0..6be0ac796 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h @@ -41,4 +41,4 @@ typedef struct nvtxExtModuleInfo_t typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); -#endif /* NVTXEXTTYPES_H */ \ No newline at end of file +#endif /* NVTXEXTTYPES_H */ diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h index 89a41d4b5..587c1a2a4 100644 --- a/src/include/nvtx_payload_schemas.h +++ b/src/include/nvtx_payload_schemas.h @@ -90,6 +90,13 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAlltoAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr) + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), @@ -106,6 +113,14 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, 
TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), @@ -123,6 +138,14 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + // Used in NCCL APIs `ncclSend` and `ncclRecv`. NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h index 18d1486d7..d92a21b4e 100644 --- a/src/include/plugin/nccl_net.h +++ b/src/include/plugin/nccl_net.h @@ -16,6 +16,7 @@ //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 +#define NCCL_NET_MULTI_REQUEST 0x2 #define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. #define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried @@ -32,22 +33,24 @@ #define NCCL_NET_MAX_PLUGINS 16 #endif +#define NCCL_NET_MAX_DEVS_PER_NIC 4 + +#include "net/net_v11.h" #include "net/net_v10.h" #include "net/net_v9.h" #include "net/net_v8.h" #include "net/net_v7.h" #include "net/net_v6.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclCollNet_v10_t ncclCollNet_t; -typedef ncclNetSGE_v10_t ncclNetSGE_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; - -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 +typedef ncclNet_v11_t ncclNet_t; +typedef ncclCollNet_v11_t ncclCollNet_t; +typedef ncclNetSGE_v11_t ncclNetSGE_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetAttr_v11_t ncclNetAttr_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v11 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v11 #endif // end include guard diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h index 710aac4d5..5ce77c3ee 100644 --- a/src/include/plugin/nccl_profiler.h +++ b/src/include/plugin/nccl_profiler.h @@ -8,14 +8,18 @@ #define NCCL_PROFILER_H_ enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileKernelCh = (1 << 6), // kernel channel event type - ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + 
ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events }; typedef enum { @@ -50,22 +54,28 @@ typedef enum { /* Kernel event states */ ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerGroupStartApiStop = 23, + ncclProfilerGroupEndApiStart = 24 } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; #include +#include "profiler/profiler_v5.h" #include "profiler/profiler_v4.h" #include "profiler/profiler_v3.h" #include "profiler/profiler_v2.h" #include "profiler/profiler_v1.h" -typedef ncclProfiler_v4_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; #define NCCL_PROFILER_NET_VER_BITS (16) #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h index f2401890d..fbd87b58f 100644 --- a/src/include/plugin/nccl_tuner.h +++ b/src/include/plugin/nccl_tuner.h @@ -11,12 +11,49 @@ #include "nccl.h" #include "nccl_common.h" +#include "tuner/tuner_v5.h" #include "tuner/tuner_v4.h" #include "tuner/tuner_v3.h" #include "tuner/tuner_v2.h" -typedef ncclTuner_v4_t ncclTuner_t; +typedef ncclTuner_v5_t ncclTuner_t; +typedef ncclTunerConstants_v5_t ncclTunerConstants_t; +typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5" + +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 +#define NCCL_NUM_ALGORITHMS NCCL_NUM_ALGORITHMS_V5 // Tree/Ring/CollNet*/PAT + +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 +#define NCCL_NUM_PROTOCOLS NCCL_NUM_PROTOCOLS_V5 // Simple/LL/LL128 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +#define NCCL_NUM_HW_LINKS NCCL_NUM_HW_LINKS_V5 + +#define NCCL_VOLTA_COMPCAP_IDX 0 +#define NCCL_AMPERE_COMPCAP_IDX 1 +#define NCCL_HOPPER_COMPCAP_IDX 2 +#define NCCL_BLACKWELL_COMPCAP_IDX 3 +#define NCCL_NUM_COMPCAPS NCCL_NUM_COMPCAPS_V5 + +#define NCCL_TUNING_SCALE_1NODE 0 +#define NCCL_TUNING_SCALE_2NODES 1 +#define NCCL_TUNING_SCALE_4NODES 2 +#define NCCL_NUM_TUNING_SCALES NCCL_NUM_TUNING_SCALES_V5 #endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h index ada6d482e..2e9187b0a 100644 --- 
a/src/include/plugin/net/net_v10.h +++ b/src/include/plugin/net/net_v10.h @@ -5,11 +5,9 @@ #ifndef NET_V10_H_ #define NET_V10_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 - typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v10_t; #define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 diff --git a/src/include/plugin/net/net_v11.h b/src/include/plugin/net/net_v11.h new file mode 100644 index 000000000..68e100637 --- /dev/null +++ b/src/include/plugin/net/net_v11.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request. +} ncclNetProperties_v11_t; + +#define NCCL_NET_ATTR_UNDEF -1 + +#define NCCL_NET_ATTR_INIT { \ + { NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF }, /* sendCommAttr */ \ + { NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF }, /* recvCommAttr */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* op */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* algo */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* proto */ \ +} + +typedef struct { + int32_t maxConcurrentPeers; + int32_t minConcurrentPeers; + int32_t maxFlowsPerPeer; + int32_t minFlowsPerPeer; +} ncclNetCommAttr_v11_t; + +typedef struct { + ncclNetCommAttr_v11_t sendCommAttr; + ncclNetCommAttr_v11_t recvCommAttr; + uint32_t op; + uint32_t algo; + uint32_t proto; +} ncclNetAttr_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the network. + ncclResult_t (*finalize)(void* ctx); + + ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr); +} ncclNet_v11_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v11_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. 
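/*
 * Editor's note -- an illustrative sketch, not part of this patch: the calling pattern
 * that the ncclNet_v11_t comments above describe for the non-blocking connect() and
 * isend()/test() paths. `net`, `ctx`, `dev`, `handle`, `buffer`, `bytes` and NCCLCHECK
 * are assumed to exist in the caller; error handling and cleanup are omitted.
 *
 *   void* sendComm = NULL;
 *   ncclNetDeviceHandle_v11_t* sendDevComm = NULL;
 *   // connect() must not block; it is called again until a comm object is returned.
 *   while (sendComm == NULL)
 *     NCCLCHECK(net->connect(ctx, dev, handle, &sendComm, &sendDevComm));
 *
 *   void* mhandle = NULL;
 *   NCCLCHECK(net->regMr(sendComm, buffer, bytes, NCCL_PTR_HOST, &mhandle));
 *
 *   void* request = NULL;
 *   int tag = 0;
 *   // isend() may return request == NULL if the send cannot be issued yet.
 *   while (request == NULL)
 *     NCCLCHECK(net->isend(sendComm, buffer, bytes, tag, mhandle, NULL, &request));
 *
 *   int done = 0;
 *   while (!done) NCCLCHECK(net->test(request, &done, NULL));
 */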
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v11_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v11_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the collective network. 
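/*
 * Editor's note -- an illustrative sketch, not part of this patch: how the collective
 * path of ncclCollNet_v11_t above is meant to be driven. `collnet`, `collComm`, the
 * send/recv buffers and the registered mhandles are assumed to come from the caller.
 *
 *   int supported = 0;
 *   collnet->reduceSupport(ncclFloat32, ncclSum, &supported);
 *   if (supported) {
 *     void* request = NULL;
 *     // iallreduce() may return request == NULL if it cannot be issued yet.
 *     while (request == NULL)
 *       collnet->iallreduce(collComm, sendBuf, recvBuf, count, ncclFloat32, ncclSum,
 *                           sendMhandle, recvMhandle, &request);
 *     int done = 0, size = 0;
 *     while (!done) collnet->test(request, &done, &size);
 *   }
 */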
+ ncclResult_t (*finalize)(void* ctx); +} ncclCollNet_v11_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h index ce9d91748..ef054bbe6 100644 --- a/src/include/plugin/net/net_v9.h +++ b/src/include/plugin/net/net_v9.h @@ -7,11 +7,9 @@ #ifndef NET_V9_H_ #define NET_V9_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 - typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v9_t; typedef struct { diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h index 300e436a0..83b58e985 100644 --- a/src/include/plugin/plugin.h +++ b/src/include/plugin/plugin.h @@ -21,4 +21,6 @@ void* ncclOpenProfilerPluginLib(const char* name); void* ncclGetNetPluginLib(enum ncclPluginType type); ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type); +extern char* ncclPluginLibPaths[]; + #endif diff --git a/src/include/plugin/profiler/profiler_v5.h b/src/include/plugin/profiler/profiler_v5.h new file mode 100644 index 000000000..dab1db9e1 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v5.h @@ -0,0 +1,151 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + bool graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId 
: communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/src/include/plugin/tuner/tuner_v5.h b/src/include/plugin/tuner/tuner_v5.h new file mode 100644 index 000000000..9e621f842 --- /dev/null +++ b/src/include/plugin/tuner/tuner_v5.h @@ -0,0 +1,87 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef TUNER_V5_H_ +#define TUNER_V5_H_ + +// NVL domain information struct +typedef struct { + int nNvlDomains; // number of NVLink domains + int minRanksPerNvlDomain; // minimum ranks across all NVLink domains + int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains +} ncclNvlDomainInfo_v5_t; + +#define NCCL_NUM_ALGORITHMS_V5 7 // Tree/Ring/CollNet*/PAT +#define NCCL_NUM_PROTOCOLS_V5 3 // Simple/LL/LL128 +#define NCCL_NUM_HW_LINKS_V5 3 +#define NCCL_NUM_COMPCAPS_V5 4 +#define NCCL_NUM_TUNING_SCALES_V5 3 + +typedef struct { + double baseLatencies [NCCL_NUM_ALGORITHMS_V5][NCCL_NUM_PROTOCOLS_V5]; + double hwLatencies [NCCL_NUM_HW_LINKS_V5][NCCL_NUM_ALGORITHMS_V5][NCCL_NUM_PROTOCOLS_V5]; + + double llMaxBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxTreeBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxNVLSTreeBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + + +} ncclTunerConstants_v5_t; + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. 
+ // Inputs: + // - commId: communicator identifier + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // - nvlDomainInfo: NVL domain information struct + // Outputs: + // - context: tuner context object + // Input/Output: + // - constants: tuner constants + ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. 
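/*
 * Editor's note -- an illustrative sketch, not part of this patch: a minimal v5
 * getCollInfo() that biases large allreduces toward Ring/Simple by lowering that
 * entry of the cost table. The 2D cast mirrors the collCostTable layout described
 * above; the size threshold and the zero cost are arbitrary choices.
 *
 *   static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
 *       int numPipeOps, float** collCostTable, int numAlgo, int numProto,
 *       int regBuff, int* nChannels) {
 *     float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
 *     if (collType == ncclFuncAllReduce && nBytes >= (1 << 20) &&
 *         table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
 *       table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // cheapest remaining option
 *     }
 *     return ncclSuccess; // unset outputs are filled in by NCCL core
 *   }
 */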
+ // context: tuner context object + ncclResult_t (*finalize)(void* context); +} ncclTuner_v5_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2fb6a7d38..f7f9980b5 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -28,12 +28,48 @@ struct ncclProfilerProxy { struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; }; +enum groupApiState { + ncclProfilerGroupApiStartStateReset = 0, + ncclProfilerGroupApiStartStateStarted = 1, + ncclProfilerGroupApiStartStateStopped = 2, +}; + +// Used by the profiler to track state for API events +typedef struct ncclProfilerApiState { + int profilerGroupDepth; + int eActivationMask; + groupApiState state; + void *groupApiEventHandle; + // Tracks the latest API event handles for p2p/collectives + void* p2pApiEventHandle; + void *collApiEventHandle; +} ncclProfilerApiState_t; + +extern __thread ncclProfilerApiState_t ncclProfilerApiState; + extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); +// Profiler Start/Stop/Record wrappers for ncclGroupStart and ncclGroupEnd API calls +ncclResult_t ncclProfilerStartGroupApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopGroupApiEvent(); +ncclResult_t ncclProfilerRecordGroupApiEventState(ncclProfilerEventState_t eState); + +//Profiler Start/Stop wrappers for P2p API calls +ncclResult_t ncclProfilerStartP2pApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopP2pApiEvent(); + +//Profiler Start/Stop wrappers for Collective API calls +ncclResult_t ncclProfilerStartCollApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopCollApiEvent(); + +// Kernel Launch Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelLaunchEvent(struct ncclKernelPlan* plan, cudaStream_t stream); +ncclResult_t ncclProfilerStopKernelLaunchEvent(struct ncclKernelPlan* plan); + // Profiler Start/Stop Group Wrappers ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); diff --git a/src/include/proxy.h b/src/include/proxy.h index 772aa206c..4613ada49 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -69,6 +69,7 @@ struct ncclProxyOp { uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclFunc_t*/ coll; + uint8_t /*ncclFunc_t*/ collAPI; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; uint8_t algorithm; @@ -81,6 +82,8 @@ struct ncclProxyOp { int isOneRPN; RingAlgorithm *ringAlgo; union ncclProxyOpSpecifics specifics; + int nChannels; + int nPeers; // Profiler plugin union { @@ -175,11 +178,14 @@ struct ncclProxyArgs { uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclPattern_t*/ pattern; uint8_t /*ncclFunc_t*/ coll; + uint8_t /*ncclFunc_t*/ collAPI; uint8_t protocol; uint8_t algorithm; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; + int nChannels; + int nPeers; int idle; @@ -338,6 +344,11 @@ struct ncclProxyState { // Progress thread struct ncclProxyProgressState progressState; + // Network plugin + void* netContext; + ncclNetAttr_t netAttr; + void* collNetContext; + // Profiler plugin void* profilerContext; diff --git a/src/include/register.h b/src/include/register.h index 231cbfc34..edfc722de 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -1,3 +1,9 @@ 
+/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_REGISTER_H_ #define NCCL_REGISTER_H_ @@ -29,15 +35,6 @@ struct ncclRegNetHandles { struct ncclRegNetHandles* next; }; -struct ncclSymRegTask { - struct ncclSymRegTask *next; - void* buff; - size_t baseSize; - CUmemGenericAllocationHandle memHandle; - struct ncclReg* regHandle; - size_t alignment; -}; - struct ncclReg { // common attributes uintptr_t begAddr, endAddr; // page aligned @@ -58,10 +55,6 @@ struct ncclReg { // general ipc reg struct ncclPeerRegIpcAddr regIpcAddrs; struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; - // symmetric reg - void* baseSymPtr; - size_t symSize; - int winFlags; }; struct ncclRegCache { @@ -70,14 +63,9 @@ struct ncclRegCache { uintptr_t pageSize; }; -struct ncclWindow { - struct ncclReg* handle; -}; - ncclResult_t ncclRegCleanup(struct ncclComm* comm); ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); -ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle); #endif diff --git a/src/include/register_inline.h b/src/include/register_inline.h index fb7641b13..76181c4ac 100644 --- a/src/include/register_inline.h +++ b/src/include/register_inline.h @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_REGISTER_INLINE_H_ #define NCCL_REGISTER_INLINE_H_ @@ -18,16 +24,5 @@ static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, } } -static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) { - struct ncclReg* regRecord = NULL; - *symPtr = NULL; - *outReg = NULL; - NCCLCHECK(ncclRegFind(comm, data, size, ®Record)); - if (regRecord && regRecord->baseSymPtr) { - *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr); - *outReg = regRecord; - } - return ncclSuccess; -} #endif diff --git a/src/include/scheduler.h b/src/include/scheduler.h new file mode 100644 index 000000000..9ee9bb232 --- /dev/null +++ b/src/include/scheduler.h @@ -0,0 +1,17 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SCHEDULER_H_ +#define NCCL_SCHEDULER_H_ + +#include "nccl.h" +#include "comm.h" +#include "sym_kernels.h" + +ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclIntruQueue* symTaskQueue, struct ncclTaskColl** remainTasksHead); +ncclResult_t ncclSymmetricTaskScheduler(struct ncclComm* comm, struct ncclIntruQueue* symTaskQueue, struct ncclKernelPlan* plan); + +#endif // NCCL_SCHEDULER_H_ diff --git a/src/include/shm.h b/src/include/shm.h index 223d87346..b944241a4 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ diff --git a/src/include/shmutils.h b/src/include/shmutils.h index 097b4c657..199b9f717 100644 --- a/src/include/shmutils.h +++ b/src/include/shmutils.h @@ -15,8 +15,8 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); struct ncclShmemCollBuff { - volatile size_t *cnt[2]; - volatile void *ptr[2]; + size_t *cnt[2]; + void *ptr[2]; int round; size_t maxTypeSize; }; diff --git a/src/include/sym_kernels.h b/src/include/sym_kernels.h new file mode 100644 index 000000000..4e742eff7 --- /dev/null +++ b/src/include/sym_kernels.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SYM_KERNELS_H_ +#define NCCL_SYM_KERNELS_H_ +#include "nccl.h" +#include "nccl_device.h" +#include "nccl_common.h" +#include "device.h" + +//////////////////////////////////////////////////////////////////////////////// +// ncclSymk[Foo]: Kernels built on the device API + +#define NCCL_SYM_KERNEL_CELL_SIZE 1024 // no less than 16 bytes minimal cell size + +constexpr int ncclSymkMaxBlocks = 64; +constexpr int ncclSymkMaxThreads = 512; +constexpr int ncclSymkLLMaxEltSize = 8; + +constexpr __host__ __device__ int ncclSymkLLMaxSlots(int eltSize = ncclSymkLLMaxEltSize) { + return ncclSymkMaxThreads*ncclSymkLLMaxEltSize/eltSize; +} + +enum ncclSymkKernelId { + ncclSymkKernelId_AllReduce_AGxLL_R, + ncclSymkKernelId_AllReduce_AGxLLMC_R, + ncclSymkKernelId_AllReduce_RSxLD_AGxST, + ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC, + ncclSymkKernelId_AllReduce_RSxNet_ARxMC_AGxNet, + + ncclSymkKernelId_AllGather_LL, + ncclSymkKernelId_AllGather_LLMC, + ncclSymkKernelId_AllGather_ST, + ncclSymkKernelId_AllGather_STMC, + + ncclSymkKernelId_ReduceScatter_LL, + ncclSymkKernelId_ReduceScatter_LD, + ncclSymkKernelId_ReduceScatter_LDMC, + + ncclSymkKernelId_Count +}; + +struct ncclSymkDevComm { + struct ncclDevComm devComm; + struct ncclLLA2AHandle lsaLLA2A; +}; + +struct ncclSymkState { + bool initialized; + struct ncclSymkDevComm kcomm; +}; + +struct ncclSymkChannelWorkRange { + uint16_t workHi; // inclusive index of my ending work + uint16_t fracHi; // 16-bit fraction in (0.0, 1.0] indicating where my part ends +}; + +// 16 bytes aligned +struct alignas(16) ncclSymkDevWork { + uint64_t redOpArg; // must be collectively uniform + size_t nElts; + struct ncclWindow_vidmem* inputWin, *outputWin; + size_t inputOff, outputOff; // these = origUserOffset + cbdPartOffset + uint64_t rootRank; + uint64_t sChannelId:16, nChannels:16, padding:32; +}; + +struct alignas(16) ncclSymkDevWorkArgs { + struct ncclSymkDevComm kcomm; + int nMaxChannels; + // starting of channelWorkRange will be aligned to 16 bytes + // channelWorkRange[nChannels]; + // ncclSymDevWork[nWorks]; + // aux functions + __host__ static constexpr size_t calcArgsSize(int nChannels, int nWorks) { + return alignUp(sizeof(struct ncclSymkDevWorkArgs), 16) + alignUp(nChannels * sizeof(struct ncclSymkChannelWorkRange), 16) + nWorks * sizeof(struct ncclSymkDevWork); + } + __host__ __device__ struct ncclSymkChannelWorkRange* getWorkRange() const { + return (struct ncclSymkChannelWorkRange*)((uint8_t*)this + alignUp(sizeof(struct ncclSymkDevWorkArgs), 16)); + } + __host__ __device__ struct ncclSymkDevWork* getWorks(int nChannels) const { + return (struct ncclSymkDevWork*)((uint8_t*)this->getWorkRange() + alignUp(nChannels * sizeof(struct ncclSymkChannelWorkRange), 16)); + } +}; + +union ncclSymkDevWorkArgs4K { + struct ncclSymkDevWorkArgs args; + char buf4K[4096]; +}; + +// We assume ncclComm contains a field: `ncclSymkState symkState` +ncclResult_t ncclSymkInitOnce(struct ncclComm* comm); +ncclResult_t ncclSymkFinalize(struct ncclComm* comm); + +bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, + ncclDataType_t ty, size_t nElts); +ncclResult_t ncclSymkPickKernel(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, + size_t nEltsTotal, size_t nEltsMax, int nWorks, + float* estTimeUs, ncclSymkKernelId* kernelId, int* nBlocks, int* nWarps); + +ncclResult_t 
ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclSymkDevWork* outDevWork); + +// Generated by src/device/symmetric/generate.py +extern int const ncclSymkKernelCount; +extern void* const ncclSymkKernelList[]; +void* ncclSymkGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); +const char* ncclSymkKernelIdToString(int kernelId); + +#endif diff --git a/src/include/symmetric.h b/src/include/symmetric.h deleted file mode 100644 index 7a189bcca..000000000 --- a/src/include/symmetric.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef NCCL_DEVICE_SYMMETRIC_H_ -#define NCCL_DEVICE_SYMMETRIC_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "bitops.h" - -constexpr int ncclSymMaxBlocks = 64; -constexpr int ncclSymMaxThreads = 512; -constexpr int ncclSymLLMaxEltSize = 64; - -constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) { - return ncclSymMaxThreads*ncclSymLLMaxEltSize/eltSize; -} - -constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) { - return /*LL Overhead*/2 * maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize); -} - -struct alignas(16) ncclSymDevBase { - uint32_t llEpoch[ncclSymMaxBlocks]; - uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks]; - uint32_t barInboxMc[ncclSymMaxBlocks]; - uint32_t barInboxPerPeer[]; - - static constexpr size_t size(int nRanks) { - return sizeof(ncclSymDevBase) + - alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) + - ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks); - } -}; - -static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) { - // Get pointer to buffer trailing the header struct. 
- char* ans = (char*)(base + 1); - // Skip over barInboxPerPeer[] - ans += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16); - // Skip to our block - int epochSize = ncclSymLLEpochSize(nRanks); - ans += block * /*epochs=*/2 * epochSize; - ans += (epoch & 1)*epochSize; - return (uint4*)ans; -} - -struct ncclSymDevComm { - ncclSymDevBase* base; - ncclSymDevBase* baseMc; - uint32_t stride4G; - int nRanks, rank; - uint32_t nRanks_rcp32; // idivRcp32(nRanks) -}; - -struct alignas(16) ncclSymDevArgs { - struct ncclSymDevComm comm; - int rootRank; - uint64_t redOpArg; // must be collectively uniform - size_t nElts; - char* input; - char* output; -}; - -enum ncclSymKernelId { - ncclSymKernelId_AllReduce_AGxLL_R, - ncclSymKernelId_AllReduce_AGxLLMC_R, - ncclSymKernelId_AllReduce_RSxLD_AGxST, - ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC, - - ncclSymKernelId_AllGather_LL, - ncclSymKernelId_AllGather_LLMC, - ncclSymKernelId_AllGather_ST, - ncclSymKernelId_AllGather_STMC, - - ncclSymKernelId_ReduceScatter_LL, - ncclSymKernelId_ReduceScatter_LD, - ncclSymKernelId_ReduceScatter_LDMC, - - ncclSymKernelId_Count -}; - -bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); - -ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps); - -// Generated by src/device/symmetric/generate.py -extern int const ncclSymKernelCount; -extern void* const ncclSymKernelList[]; -void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); -const char* ncclSymKernelIdToString(int kernelId); - -#endif diff --git a/src/include/transport.h b/src/include/transport.h index a9971a74f..39e479e24 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -162,13 +162,9 @@ ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels); -ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm); -ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr); -ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr); -ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm); -ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm); -ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr); -ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr); -ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm); +#if CUDART_VERSION >= 12010 +ncclResult_t ncclNvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle); +ncclResult_t ncclNvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle); +#endif #endif diff --git a/src/include/utils.h b/src/include/utils.h index bfed2722c..46389985f 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -551,4 +551,6 @@ T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { 
return head; } } + +ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)(int), char *buf, size_t bufLen, const char *wildcard); #endif diff --git a/src/init.cc b/src/init.cc index af784c02d..ebf942c02 100644 --- a/src/init.cc +++ b/src/init.cc @@ -27,10 +27,14 @@ #include #include #include +#include #include #include "param.h" #include "nvtx_payload_schemas.h" #include "utils.h" +#include +#include "ce_coll.h" +#include "nvtx.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -54,6 +58,9 @@ NCCL_PARAM(WinEnable, "WIN_ENABLE", 1); NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(SetCpuStackSize, "SET_CPU_STACK_SIZE", 1); + +extern int64_t ncclParamSingleProcMemRegEnable(); static ncclResult_t commReclaim(ncclComm_t comm); @@ -70,11 +77,50 @@ ncclResult_t initGdrCopy() { return ncclSuccess; } +// The default Linux stack size (8MB) is safe. +#define SAFE_STACK_SIZE (8192*1024) + +static ncclResult_t setCpuStackSize() { + if (ncclParamSetCpuStackSize() != 0) { + // Query the stack size used for newly launched threads. + pthread_attr_t attr; + size_t stackSize; + PTHREADCHECK(pthread_attr_init(&attr), "pthread_attr_init"); + PTHREADCHECK(pthread_attr_getstacksize(&attr, &stackSize), "pthread_attr_getstacksize"); + + if (stackSize < SAFE_STACK_SIZE) { + // GNU libc normally uses RLIMIT_STACK as the default pthread stack size, unless it's set to "unlimited" -- + // in that case a fallback value of 2MB (!) is used. + + // Query the actual resource limit so that we can distinguish between the settings of 2MB and unlimited. + struct rlimit stackLimit; + char buf[30]; + SYSCHECK(getrlimit(RLIMIT_STACK, &stackLimit), "getrlimit"); + if (stackLimit.rlim_cur == RLIM_INFINITY) + strcpy(buf, "unlimited"); + else + snprintf(buf, sizeof(buf), "%ldKB", stackLimit.rlim_cur/1024); + INFO(NCCL_INIT|NCCL_ENV, "Stack size limit (%s) is unsafe; will use %dKB for newly launched threads", + buf, SAFE_STACK_SIZE/1024); + + // Change the default pthread stack size (via a nonportable API, which will become necessary if we switch + // to C++ threads). 
+ PTHREADCHECK(pthread_attr_setstacksize(&attr, SAFE_STACK_SIZE), "pthread_attr_setstacksize"); + PTHREADCHECK(pthread_setattr_default_np(&attr), "pthread_setattr_default_np"); + } + + PTHREADCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy"); + } + + return ncclSuccess; +} + static ncclResult_t initResult = ncclSuccess; -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static void initOnceFunc() { initEnv(); + setCpuStackSize(); initGdrCopy(); // Always initialize bootstrap network NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit); @@ -84,7 +130,7 @@ exit:; } static ncclResult_t ncclInit() { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return initResult; } @@ -180,10 +226,12 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - if (comm->symmetricSupport && comm->symDevComm.base) { - NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride)); - } + NCCLCHECK(ncclCeFinalize(comm)); + if (comm->symmetricSupport) { + NCCLCHECK(ncclSymkFinalize(comm)); + NCCLCHECK(ncclDevrFinalize(comm)); + } NCCLCHECK(ncclRasCommFini(comm)); /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will @@ -263,10 +311,6 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); - if (comm->symmetricSupport) { - NCCLCHECK(ncclNvlsSymmetricFinalize(comm)); - NCCLCHECK(ncclIpcSymmetricFinalize(comm)); - } INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. @@ -414,6 +458,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclIntruQueueMpscConstruct(&comm->callbackQueue); ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue); + ncclIntruQueueConstruct(&comm->ceInitTaskQueue); comm->regCache.pageSize = sysconf(_SC_PAGESIZE); @@ -436,8 +481,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in static ncclResult_t devCommSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; int nRanks = comm->nRanks; - struct ncclDevCommAndChannels tmpCommAndChans; - struct ncclDevCommAndChannels *devCommAndChans = NULL; + struct ncclKernelCommAndChannels tmpCommAndChans; + struct ncclKernelCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; cudaStream_t deviceStream; @@ -465,7 +510,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); memset(&ccStatus, 0, sizeof(ccStatus)); - ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE); + ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE || ccStatus.multiGpuNVLE); if (ccEnable) { comm->workFifoBytes = 0; } else { @@ -582,14 +627,28 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; if (ncclParamMNNVLUUID() != -1) { - ((long*)&info->fabricInfo.clusterUuid)[0] = 
ncclParamMNNVLUUID(); - ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + unsigned long temp_uuid0 = (unsigned long)ncclParamMNNVLUUID(); + unsigned long temp_uuid1 = (unsigned long)ncclParamMNNVLUUID(); + memcpy(info->fabricInfo.clusterUuid, &temp_uuid0, sizeof(temp_uuid0)); + memcpy(info->fabricInfo.clusterUuid + sizeof(temp_uuid0), &temp_uuid1, sizeof(temp_uuid1)); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); + memcpy(&uuid0, info->fabricInfo.clusterUuid, sizeof(uuid0)); + memcpy(&uuid1, info->fabricInfo.clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if (ncclParamMNNVLCliqueId() == -2) { + nvmlPlatformInfo_t platformInfo = { 0 }; + NCCLCHECK(ncclNvmlDeviceGetPlatformInfo(nvmlDev, &platformInfo)); + INFO(NCCL_INIT, "MNNVL rack serial %s slot %d tray %d hostId %d peerType %d moduleId %d", + platformInfo.chassisSerialNumber, platformInfo.slotNumber, platformInfo.trayIndex, + platformInfo.hostId, platformInfo.peerType, platformInfo.moduleId); + // Use a hash of the Rack serial number to partition the NVLD clique + info->fabricInfo.cliqueId = getHash(platformInfo.chassisSerialNumber, sizeof(platformInfo.chassisSerialNumber)); + } else if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, - ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], + uuid0, uuid1, info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } } @@ -670,6 +729,18 @@ NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); #define TIMER_INIT_ALLOC 7 #define TIMERS_INIT_COUNT 8 +static ncclResult_t initNvlDomainInfo(struct ncclComm* comm) { + // Initialize NVLink domain info + comm->nvlDomainInfo.nNvlDomains = comm->nNodes; + comm->nvlDomainInfo.minRanksPerNvlDomain = comm->minLocalRanks; + comm->nvlDomainInfo.maxRanksPerNvlDomain = comm->maxLocalRanks; + + TRACE(NCCL_INIT, "NVLink domains: %d domains, min ranks per domain: %d, max ranks per domain: %d", + comm->nNodes, comm->nvlDomainInfo.minRanksPerNvlDomain, comm->nvlDomainInfo.maxRanksPerNvlDomain); + + return ncclSuccess; +} + static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers // 1. 
{ peerInfo, comm, compCap} @@ -781,6 +852,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Buffer Registration is not supported with MNNVL if (comm->MNNVL) comm->nvlsRegSupport = 0; + else if (ncclParamSingleProcMemRegEnable()) comm->nvlsRegSupport = 1; TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); @@ -969,10 +1041,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; comm->nodeRanks[node].localRanks++; } + comm->minLocalRanks = INT_MAX; // Allocate ranks arrays for each node for (int n=0; nnNodes; n++) { NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail); comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); + comm->minLocalRanks = std::min(comm->minLocalRanks, comm->nodeRanks[n].localRanks); comm->nodeRanks[n].localRanks = 0; } // And fill the ranks arrays @@ -985,6 +1059,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->localRank = comm->rankToLocalRank[rank]; comm->localRanks = comm->nodeRanks[comm->node].localRanks; + NCCLCHECKGOTO(initNvlDomainInfo(comm), ret, fail); + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { @@ -1227,6 +1303,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations + NCCLCHECKGOTO(ncclTopoInitTunerConstants(comm), ret, fail); + NCCLCHECKGOTO(ncclTunerPluginLoad(comm), ret, fail); + if (comm->tuner) { + NCCLCHECK(comm->tuner->init(&comm->tunerContext, comm->commHash, comm->nRanks, comm->nNodes, ncclDebugLog, &comm->nvlDomainInfo, &comm->tunerConstants)); + } NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); @@ -1248,7 +1329,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable(); - comm->baseStride = 0; + comm->devrState.bigSize = 0; + + comm->ceColl.baseUCSymReadyPtr = NULL; + comm->ceColl.baseUCSymComplPtr = NULL; // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. 
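/*
 * Editor's note -- an illustrative sketch, not part of this patch: a plugin-side init()
 * matching the v5 call made above (comm->tuner->init(..., &comm->nvlDomainInfo,
 * &comm->tunerConstants)). The struct fields come from tuner_v5.h; the log message and
 * the latency tweak are assumptions made for illustration only.
 *
 *   static ncclResult_t exampleTunerInit(void** ctx, uint64_t commId, size_t nRanks,
 *       size_t nNodes, ncclDebugLogger_t logFunction,
 *       ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
 *     logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
 *                 "tuner init: %zu ranks, %zu nodes, %d NVL domains (%d-%d ranks each)",
 *                 nRanks, nNodes, nvlDomainInfo->nNvlDomains,
 *                 nvlDomainInfo->minRanksPerNvlDomain, nvlDomainInfo->maxRanksPerNvlDomain);
 *     // Constants may be adjusted in place before NCCL computes its tuning tables.
 *     constants->baseLatencies[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] *= 1.1;
 *     *ctx = NULL; // this sketch keeps no per-communicator state
 *     return ncclSuccess;
 *   }
 */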
@@ -1287,6 +1371,10 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); #define NCCL_MAX_CGA_CLUSTER_SIZE 8 +NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); + + #define NCCL_COMMINIT_FUNCNAME_LEN 128 struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; @@ -1416,15 +1504,15 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; } - timers[TIMER_INIT_ALLOC] = clockNano(); - NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); - timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // child hash obtained from (parent hash, split count, color) uint64_t hacc[2] = {1, 1}; eatHash(hacc, &job->parent->commHash); eatHash(hacc, &job->splitCount); eatHash(hacc, &job->color); comm->commHash = digestHash(hacc); + timers[TIMER_INIT_ALLOC] = clockNano(); + NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); + timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1433,11 +1521,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // debug info, no commId was used commIdHash = 0; } else { + // obtain a unique hash using the first commId + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash using the first commId - comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1447,10 +1535,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm->cudaArch = cudaArch; NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); - if (comm->tuner) { - NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); - } // update communicator state comm->initState = ncclSuccess; @@ -1511,8 +1595,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int ctaPolicyEnv; int shrinkShareEnv; int nvlsCTAsEnv; + int nChannelsPerNetPeerEnv; + int nvlinkUtilCentricSchedEnableEnv; - /* override configuration from env variable. */ + /* override configuration with env variable. */ blockingEnv = ncclParamCommBlocking(); if (blockingEnv == 0 || blockingEnv == 1) comm->config.blocking = blockingEnv; @@ -1541,6 +1627,23 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.maxCTAs = maxCTAsEnv; } + /* override configuration with env variable. 
*/ + nChannelsPerNetPeerEnv = ncclParamNChannelsPerNetPeer(); + if (nChannelsPerNetPeerEnv != NCCL_CONFIG_UNDEF_INT) { + if (nChannelsPerNetPeerEnv <= 0) + INFO(NCCL_ENV, "NCCL_NCHANNELS_PER_NET_PEER %d is too low, leaving it set at %d", nChannelsPerNetPeerEnv, comm->config.nChannelsPerNetPeer); + else + comm->config.nChannelsPerNetPeer = nChannelsPerNetPeerEnv; + } + + nvlinkUtilCentricSchedEnableEnv = ncclParamNvlinkUtilCentricSchedEnable(); + if (nvlinkUtilCentricSchedEnableEnv != NCCL_CONFIG_UNDEF_INT) { + if (nvlinkUtilCentricSchedEnableEnv != 0 && nvlinkUtilCentricSchedEnableEnv != 1) + INFO(NCCL_ENV, "NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE %d is not valid, leaving it set at %d", nvlinkUtilCentricSchedEnableEnv, comm->config.nvlinkCentricSched); + else + comm->config.nvlinkCentricSched = nvlinkUtilCentricSchedEnableEnv; + } + envNetName = ncclGetEnv("NCCL_NET"); if (envNetName) tmpNetName = envNetName; @@ -1608,7 +1711,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.collnetEnable = 0; } - if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) { + if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_ZERO) { INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT); comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT; } @@ -1617,6 +1720,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs); comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT; } + return ret; } @@ -1668,6 +1772,10 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { internalConfigPtr->shrinkShare = defaultConfig.shrinkShare; internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs; } + if (internalConfigPtr->version < NCCL_VERSION(2, 28, 0)) { + internalConfigPtr->nChannelsPerNetPeer = defaultConfig.nChannelsPerNetPeer; + internalConfigPtr->nvlinkCentricSched = defaultConfig.nvlinkCentricSched; + } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */ @@ -1706,7 +1814,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { } if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT || - internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) { + internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_ZERO)) { WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy); ret = ncclInvalidArgument; goto fail; @@ -1724,6 +1832,18 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { goto fail; } + if (internalConfigPtr->nChannelsPerNetPeer != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->nChannelsPerNetPeer <= 0 || internalConfigPtr->nChannelsPerNetPeer > MAXCHANNELS)) { + WARN("Invalid config nChannelsPerNetPeer attribute value %d", internalConfigPtr->nChannelsPerNetPeer); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->nvlinkCentricSched != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlinkCentricSched != 0 && internalConfigPtr->nvlinkCentricSched != 1) { + WARN("Invalid config nvlinkCentricSched attribute value %d", internalConfigPtr->nvlinkCentricSched); + ret = ncclInvalidArgument; + goto fail; + } + /* default config value can be tuned on different platform. 
*/ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); @@ -1737,6 +1857,9 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nChannelsPerNetPeer, NCCL_CONFIG_UNDEF_INT, + NCCL_CONFIG_UNDEF_INT, "nChannelsPerNetPeer", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlinkCentricSched, NCCL_CONFIG_UNDEF_INT, 0, "nvlinkCentricSched", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1751,6 +1874,8 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.CTAPolicy = internalConfigPtr->CTAPolicy; comm->config.shrinkShare = internalConfigPtr->shrinkShare; comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs; + comm->config.nChannelsPerNetPeer = internalConfigPtr->nChannelsPerNetPeer; + comm->config.nvlinkCentricSched = internalConfigPtr->nvlinkCentricSched; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: @@ -1779,8 +1904,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId NCCLCHECKGOTO(ncclInit(), res, fail); if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, showVersion); + static std::once_flag once; + std::call_once(once, showVersion); } // Make sure the CUDA runtime is initialized. 
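/*
 * Editor's note -- an illustrative sketch, not part of this patch: how an application
 * could opt into the two config fields validated above. NCCL_CONFIG_INITIALIZER and
 * ncclCommInitRankConfig() are existing public API; `comm`, `nRanks`, `id` and `rank`
 * are assumed to exist in the caller and the values chosen are arbitrary examples
 * within the accepted ranges.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.nChannelsPerNetPeer = 2;  // must be > 0 and <= MAXCHANNELS
 *   config.nvlinkCentricSched = 1;   // 0 or 1; enables NVLink-utilization-centric scheduling
 *   ncclCommInitRankConfig(&comm, nRanks, id, rank, &config);
 */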
CUDACHECKGOTO(cudaFree(NULL), res, fail); @@ -2054,7 +2179,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { static ncclResult_t commCleanup(ncclComm_t comm) { CUDACHECK(cudaSetDevice(comm->cudaDev)); if (comm->tuner != NULL) { - NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); + NCCLCHECK(comm->tuner->finalize(comm->tunerContext)); NCCLCHECK(ncclTunerPluginUnload(comm)); } NCCLCHECK(commFree(comm)); @@ -2158,7 +2283,7 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; return ncclSuccess; } @@ -2210,6 +2335,10 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } + + INFO(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx - Abort START", + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit NCCLCHECK(setCommAbortFlags(comm,1)); @@ -2418,7 +2547,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); @@ -2432,7 +2561,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); @@ -2445,7 +2574,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); diff --git a/src/init_nvtx.cc b/src/init_nvtx.cc index 1cb1277d2..b7005123b 100644 --- a/src/init_nvtx.cc +++ b/src/init_nvtx.cc @@ -1,5 +1,12 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "nccl.h" #include "nvtx.h" +#include "param.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Sum", ncclSum, 0}, @@ -9,9 +16,15 @@ static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Avg", ncclAvg, 0} }; +NCCL_PARAM(NvtxDisable, "NVTX_DISABLE", 0); + // Must be called before the first call to any reduction operation. 
void initNvtxRegisteredEnums() { // Register schemas and strings + if (ncclParamNvtxDisable()) { + return; + } + constexpr const nvtxPayloadEnumAttr_t eAttr { .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, diff --git a/src/misc/CMakeLists.txt b/src/misc/CMakeLists.txt new file mode 100644 index 000000000..984becc5f --- /dev/null +++ b/src/misc/CMakeLists.txt @@ -0,0 +1,20 @@ +# Misc sources +set(MISC_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/strongstream.cc + ${CMAKE_CURRENT_SOURCE_DIR}/socket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ibvwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mlx5dvsymbols.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mlx5dvwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/cudawrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/param.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ipcsocket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/shmutils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/nvmlwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/argcheck.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gdrwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ibvsymbols.cc +) + +# Add misc sources to parent scope +set(MISC_SOURCES ${MISC_SOURCES} PARENT_SCOPE) diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 5b66fea92..1ecb35fb2 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -9,6 +9,7 @@ #include "debug.h" #include "param.h" #include "cudawrap.h" +#include // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); @@ -153,6 +154,12 @@ DECLARE_CUDA_PFN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN(cuMulticastUnbind, 12010); #endif +/* Stream MemOp support */ +DECLARE_CUDA_PFN(cuStreamBatchMemOp, 11070); +DECLARE_CUDA_PFN(cuStreamWaitValue32, 11070); +DECLARE_CUDA_PFN(cuStreamWaitValue64, 11070); +DECLARE_CUDA_PFN(cuStreamWriteValue32, 11070); +DECLARE_CUDA_PFN(cuStreamWriteValue64, 11070); #endif #define CUDA_DRIVER_MIN_VERSION 11030 @@ -238,11 +245,17 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuMulticastGetGranularity, 12010, 1); LOAD_SYM(cuMulticastUnbind, 12010, 1); #endif +/* Stream MemOp support */ + LOAD_SYM(cuStreamBatchMemOp, 11070, 1); + LOAD_SYM(cuStreamWaitValue32, 11070, 1); + LOAD_SYM(cuStreamWaitValue64, 11070, 1); + LOAD_SYM(cuStreamWriteValue32, 11070, 1); + LOAD_SYM(cuStreamWriteValue64, 11070, 1); return ncclSuccess; } #endif -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; static void initOnceFunc() { @@ -295,6 +308,6 @@ static void initOnceFunc() { } ncclResult_t ncclCudaLibraryInit() { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return initResult; } diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index 3b46759c6..cef254cf2 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -5,6 +5,7 @@ ************************************************************************/ #include "gdrwrap.h" +#include #ifndef GDR_DIRECT #include "core.h" @@ -47,7 +48,7 @@ pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; *cast = tmp; \ } while (0) -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; static void initOnceFunc(void) { @@ -97,7 +98,7 @@ static void initOnceFunc(void) { ncclResult_t wrap_gdr_symbols(void) { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return 
initResult; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 59f52e320..6d6586e78 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -7,6 +7,7 @@ #include "ibvwrap.h" #include #include +#include #ifdef NCCL_BUILD_RDMA_CORE #include @@ -15,12 +16,12 @@ #endif #include "ibvsymbols.h" -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; struct ncclIbvSymbols ibvSymbols; ncclResult_t wrap_ibv_symbols(void) { - pthread_once(&initOnceControl, + std::call_once(initOnceFlag, [](){ initResult = buildIbvSymbols(&ibvSymbols); }); return initResult; } diff --git a/src/misc/mlx5dvwrap.cc b/src/misc/mlx5dvwrap.cc index 930ed5d2e..af4f41dff 100644 --- a/src/misc/mlx5dvwrap.cc +++ b/src/misc/mlx5dvwrap.cc @@ -7,6 +7,7 @@ #include "mlx5/mlx5dvwrap.h" #include #include +#include #ifdef NCCL_BUILD_MLX5DV #include @@ -15,12 +16,12 @@ #endif #include "mlx5/mlx5dvsymbols.h" -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; struct ncclMlx5dvSymbols mlx5dvSymbols; ncclResult_t wrap_mlx5dv_symbols(void) { - pthread_once(&initOnceControl, + std::call_once(initOnceFlag, [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); }); return initResult; } @@ -28,7 +29,7 @@ ncclResult_t wrap_mlx5dv_symbols(void) { /* CHECK_NOT_NULL: helper macro to check for NULL symbol */ #define CHECK_NOT_NULL(container, internal_name) \ if (container.internal_name == NULL) { \ - WARN("lib wrapper not initialized."); \ + WARN("NET/MLX5: lib wrapper not initialized."); \ return ncclInternalError; \ } @@ -36,16 +37,7 @@ ncclResult_t wrap_mlx5dv_symbols(void) { CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ - WARN("Call to " name " failed with error %s", strerror(errno)); \ - return ncclSystemError; \ - } \ - return ncclSuccess; - -#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ - CHECK_NOT_NULL(container, internal_name); \ - int ret = container.call; \ - if (ret != success_retval) { \ - INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + WARN("NET/MLX5: Call to " name " failed with error %s", strerror(errno)); \ return ncclSystemError; \ } \ return ncclSuccess; @@ -57,8 +49,14 @@ bool wrap_mlx5dv_is_supported(struct ibv_device *device) { return mlx5dvSymbols.mlx5dv_internal_is_supported(device); } -ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) { - MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path"); +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) { + CHECK_NOT_NULL(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path); + int ret = mlx5dvSymbols.mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len); + if (ret == 0) return ncclSuccess; + /* ENODEV can happen if the devices is not data-direct but mlx5 is used. 
It's not an error*/ + if (ret == ENODEV) return ncclInvalidArgument; + INFO(NCCL_NET, "NET/MLX5: Call to mlx5dv_internal_get_data_direct_sysfs_path failed with error %s errno %d", strerror(ret), ret); + return ncclSystemError; } /* DMA-BUF support */ @@ -72,4 +70,4 @@ struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t off return NULL; } return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access); -} \ No newline at end of file +} diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index 66ba2d4c8..d26a6facf 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -41,6 +41,7 @@ namespace { NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) // MNNVL support NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo)) + NCCL_NVML_FN(nvmlDeviceGetPlatformInfo, nvmlReturn_t, (nvmlDevice_t device, nvmlPlatformInfo_t *platfromInfo)) // CC support NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state)); NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting)); @@ -95,6 +96,7 @@ ncclResult_t ncclNvmlEnsureInitialized() { {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}, // MNNVL support {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"}, + {(void**)&pfn_nvmlDeviceGetPlatformInfo, "nvmlDeviceGetPlatformInfo"}, // CC support {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"}, {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"} @@ -298,6 +300,15 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI return ncclSuccess; } +ncclResult_t ncclNvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *platformInfo) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + platformInfo->version = nvmlPlatformInfo_v2; + NVMLTRY(nvmlDeviceGetPlatformInfo, device, platformInfo); + return ncclSuccess; +} + + ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); @@ -314,6 +325,10 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { status->multiGpuProtectedPCIE = true; else status->multiGpuProtectedPCIE = false; + if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_NVLE) + status->multiGpuNVLE = true; + else + status->multiGpuNVLE = false; } else if (pfn_nvmlSystemGetConfComputeState != NULL) { NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) @@ -321,9 +336,11 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { else status->CCEnabled = false; status->multiGpuProtectedPCIE = false; + status->multiGpuNVLE = false; } else { status->CCEnabled = false; status->multiGpuProtectedPCIE = false; + status->multiGpuNVLE = false; } return ncclSuccess; } diff --git a/src/misc/param.cc b/src/misc/param.cc index d7c324fe9..9060b0066 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include const char* userHomeDir() { @@ -67,13 +68,13 @@ static void initEnvFunc() { } void initEnv() { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, initEnvFunc); + static std::once_flag once; + 
std::call_once(once, initEnvFunc); } void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { - static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&mutex); + static std::mutex mutex; + std::lock_guard lock(mutex); if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { const char* str = ncclGetEnv(env); int64_t value = deftVal; @@ -89,7 +90,6 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 } __atomic_store_n(cache, value, __ATOMIC_RELAXED); } - pthread_mutex_unlock(&mutex); } const char* ncclGetEnv(const char* name) { diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index eb9cd1015..59adedf24 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -114,8 +114,11 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void } if (devShmPtr) { + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), ret, fail); CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail); CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), ret, fail); } shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); @@ -182,34 +185,36 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) { ncclResult_t ret = ncclSuccess; - int curRound; - size_t mycnt; + int nextRound = shmem->round + 1; + int curIndex = shmem->round % 2; + bool done; + int index = 0; + size_t maxTypeSize = shmem->maxTypeSize; - if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) { + if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || maxTypeSize < typeSize) { ret = ncclInvalidArgument; goto exit; } - curRound = shmem->round; - memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize); - /* sync among local ranks */ - mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL); - if (mycnt == comm->localRanks) { - *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */ - __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */ - } else { - uint64_t t0 = clockNano(); - while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { - if (clockNano() - t0 >= 5 * 1000) sched_yield(); - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) { - ret = ncclInternalError; - goto exit; + memcpy((char*)shmem->ptr[curIndex] + comm->localRank * maxTypeSize, sendbuff, typeSize); + /* reset the previous round and notify I arrive this round */ + __atomic_store_n((int*)((char*)shmem->cnt[curIndex] + CACHE_LINE_SIZE * comm->localRank), nextRound, __ATOMIC_RELEASE); + + do { + done = true; + for (int i = index; i < comm->localRanks; ++i) { + if (i != comm->localRank && __atomic_load_n((int*)((char*)shmem->cnt[curIndex] + CACHE_LINE_SIZE * i), __ATOMIC_ACQUIRE) < nextRound) { + done = false; + index = i; + break; } } - } + } while (!done); - memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize); - shmem->round ^= 1; + for (int i = 0; i < comm->localRanks; ++i) { + memcpy((uint8_t*)recvbuff + i * typeSize, (uint8_t*)shmem->ptr[curIndex] + i * 
maxTypeSize, typeSize); + } + shmem->round = nextRound; exit: return ret; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index d066d2829..5633fef3e 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -149,6 +149,9 @@ static ncclResult_t findInterfaces(const char* prefixList, char* names, union nc if (family != AF_INET && family != AF_INET6) continue; + /* Only consider running interfaces, i.e. UP and physically attached. */ + if (!(interface->ifa_flags & IFF_RUNNING)) continue; + TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ @@ -377,11 +380,12 @@ ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs)); } } - // Then look for anything else (but not docker or lo) - if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + // Then look for anything else (but not docker,lo, or virtual) + if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo,virbr", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // Finally look for docker, then lo. if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + if (*nIfs == 0) NCCLCHECK(findInterfaces("virbr", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } return ncclSuccess; } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 1766f4167..d92b506cb 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -8,6 +8,7 @@ #include "cudawrap.h" #include "checks.h" #include "param.h" +#include #if CUDART_VERSION >= 13000 #define cudaStreamGetCaptureInfo_v3 cudaStreamGetCaptureInfo @@ -27,14 +28,14 @@ struct ncclStrongStreamCapture { //////////////////////////////////////////////////////////////////////////////// static ncclCudaContext* cxtListHead = nullptr; -static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex cxtListMutex; ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { ncclResult_t result = ncclSuccess; CUcontext hcontext; CUCHECK(cuCtxGetCurrent(&hcontext)); - pthread_mutex_lock(&cxtListLock); + std::lock_guard lock(cxtListMutex); struct ncclCudaContext* p = cxtListHead; while (1) { if (p == nullptr) { @@ -53,13 +54,12 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { p = p->next; } leave: - pthread_mutex_unlock(&cxtListLock); *out = p; return ncclSuccess; } void ncclCudaContextDrop(struct ncclCudaContext* cxt) { - pthread_mutex_lock(&cxtListLock); + std::lock_guard lock(cxtListMutex); if (0 == --cxt->refCount) { struct ncclCudaContext** pp = &cxtListHead; while (*pp != cxt) pp = &(*pp)->next; @@ -68,7 +68,6 @@ void ncclCudaContextDrop(struct ncclCudaContext* cxt) { ncclStrongStreamDestruct(&cxt->launchOrder); free(cxt); } - pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/misc/utils.cc b/src/misc/utils.cc index bb59947e4..7e7179411 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -10,6 +10,7 @@ #include "nvmlwrap.h" #include +#include // Get current Compute Capability int ncclCudaCompCap() { @@ -107,8 +108,8 @@ static void getHostHashOnce() { 
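The findInterfaces() change above now skips interfaces that are not RUNNING and demotes virbr* bridges to the same last-resort tier as docker* and lo. A simplified, standalone sketch of that selection rule (not the actual NCCL code, which also honors prefix lists and a forced address family):

#include <ifaddrs.h>
#include <net/if.h>
#include <sys/socket.h>
#include <cstdio>
#include <cstring>

// Print interfaces that would survive the first-pass filter: an IPv4/IPv6
// address, RUNNING (up and physically attached), and not a docker/lo/virbr name.
int listPreferredInterfaces() {
  struct ifaddrs* all = nullptr;
  if (getifaddrs(&all) != 0) return -1;
  for (struct ifaddrs* it = all; it != nullptr; it = it->ifa_next) {
    if (it->ifa_addr == nullptr) continue;
    int family = it->ifa_addr->sa_family;
    if (family != AF_INET && family != AF_INET6) continue;
    if (!(it->ifa_flags & IFF_RUNNING)) continue;            // skip down/detached NICs
    if (strncmp(it->ifa_name, "docker", 6) == 0) continue;   // container bridge
    if (strncmp(it->ifa_name, "virbr", 5) == 0) continue;    // libvirt bridge
    if (strcmp(it->ifa_name, "lo") == 0) continue;           // loopback
    printf("candidate interface: %s\n", it->ifa_name);
  }
  freeifaddrs(all);
  return 0;
}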
hostHashValue = getHash(hostHash, strlen(hostHash)); } uint64_t getHostHash(void) { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, getHostHashOnce); + static std::once_flag once; + std::call_once(once, getHostHashOnce); return hostHashValue; } @@ -289,3 +290,28 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { h = h1; } } + +/* return concatenated string representing each set bit */ +ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)(int), char *buf, size_t bufLen, const char *wildcard) { + if (!buf || !bufLen) + return ncclInvalidArgument; + + bits &= mask; + + // print wildcard value if all bits set + if (wildcard && bits == mask) { + snprintf(buf, bufLen, "%s", wildcard); + return ncclSuccess; + } + + // Add each set bit to string + int pos = 0; + for (int i = 0; bits; i++, bits >>= 1) { + if (bits & 1) { + if (pos > 0) pos += snprintf(buf + pos, bufLen - pos, "|"); + pos += snprintf(buf + pos, bufLen - pos, "%s", toStr(i)); + } + } + + return ncclSuccess; +} diff --git a/src/mnnvl.cc b/src/mnnvl.cc index 34a18b80a..fb41106ab 100644 --- a/src/mnnvl.cc +++ b/src/mnnvl.cc @@ -36,7 +36,11 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; // Check if the cluster UUID and cliqueId match // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; + memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0)); + memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if ((uuid0 | uuid1) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { if (i == comm->rank) { diff --git a/src/nccl.h.in b/src/nccl.h.in index 292a83914..0c53c826e 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -12,7 +12,7 @@ #if CUDART_VERSION >= 11000 #include #endif -#if CUDART_VERSION >= 11080 +#if __cplusplus && CUDART_VERSION >= 11080 #include #endif @@ -29,9 +29,10 @@ extern "C" { #endif #include + /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; -typedef struct ncclWindow* ncclWindow_t; +typedef struct ncclWindow_vidmem* ncclWindow_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 @@ -57,9 +58,12 @@ typedef enum { ncclSuccess = 0, #define NCCL_WIN_DEFAULT 0x00 #define NCCL_WIN_COLL_SYMMETRIC 0x01 +#define NCCL_WIN_REQUIRED_ALIGNMENT 4096 + /* NCCL performance policy */ #define NCCL_CTA_POLICY_DEFAULT 0x00 #define NCCL_CTA_POLICY_EFFICIENCY 0x01 +#define NCCL_CTA_POLICY_ZERO 0x02 /* ncclCommShrink flags*/ #define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */ @@ -67,7 +71,7 @@ typedef enum { ncclSuccess = 0, /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v22700 { +typedef struct ncclConfig_v22800 { /* attributes that users should never touch. */ size_t size; unsigned int magic; @@ -85,6 +89,8 @@ typedef struct ncclConfig_v22700 { int CTAPolicy; int shrinkShare; int nvlsCTAs; + int nChannelsPerNetPeer; + int nvlinkCentricSched; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. 
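The ncclMnnvlCheck() change above replaces a long*-cast read of the 16-byte cluster UUID with two memcpy calls into local integers; the cast form risks a misaligned, aliasing-unsafe access, while memcpy compiles to ordinary loads. A generic sketch of the safe form:

#include <cstdint>
#include <cstring>

// Returns true when all 16 UUID bytes are zero, reading them through memcpy
// so the check is valid regardless of the field's alignment.
static bool uuidIsZero(const unsigned char uuid[16]) {
  uint64_t lo = 0, hi = 0;
  std::memcpy(&lo, uuid, sizeof(lo));
  std::memcpy(&hi, uuid + sizeof(lo), sizeof(hi));
  return (lo | hi) == 0;
}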
@@ -105,6 +111,8 @@ typedef struct ncclConfig_v22700 { NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \ NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \ NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* nChannelsPerNetPeer */ \ + NCCL_CONFIG_UNDEF_INT, /* nvlinkCentricSched */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ @@ -220,7 +228,9 @@ const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Reload environment variables that determine logging. */ +__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) void ncclResetDebugInit(); +__attribute__ ((deprecated("pncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) void pncclResetDebugInit(); /* Checks whether the comm has encountered any asynchronous errors */ @@ -427,6 +437,49 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +/* + * All-to-All + * + * Each device sends count values to all other devices and receives count values + * from all other devices. Data to send to destination rank j is taken from + * sendbuff+j*count and data received from source rank i is placed at + * recvbuff+i*count. + */ +ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +/* + * Gather + * + * Each rank sends count elements from sendbuff to the root rank. + * On the root rank, data from rank i is placed at recvbuff + i*count. + * On non-root ranks, recvbuff is not used. + * root is the rank where data will be gathered. + * + * In-place operations will happen if sendbuff == recvbuff + root * count. + */ +ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); + +/* + * Scatter + * + * On the root rank, count elements from sendbuff+i*count are sent to rank i. + * On non-root ranks, sendbuff is not used. + * Each rank receives count elements into recvbuff. + * root is the rank that will distribute the data. + * + * In-place operations will happen if recvbuff == sendbuff + root * count. 
+ */ +ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); + /* * Send * diff --git a/src/nccl_device/CMakeLists.txt b/src/nccl_device/CMakeLists.txt new file mode 100644 index 000000000..9d0c3d100 --- /dev/null +++ b/src/nccl_device/CMakeLists.txt @@ -0,0 +1,9 @@ +# Register sources +set(SYM_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/core.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ll_a2a.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mem_barrier.cc +) + +# Add register sources to parent scope +set(SYM_SOURCES ${SYM_SOURCES} PARENT_SCOPE) diff --git a/src/nccl_device/core.cc b/src/nccl_device/core.cc new file mode 100644 index 000000000..bae6b39bf --- /dev/null +++ b/src/nccl_device/core.cc @@ -0,0 +1,57 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "comm.h" +#include "nccl_device/impl/core__funcs.h" + +NCCL_API(ncclTeam_t, ncclTeamWorld, ncclComm_t comm); +ncclTeam_t ncclTeamWorld(ncclComm_t comm) { + ncclTeam_t ans; + ans.nRanks = comm->nRanks; + ans.rank = comm->rank; + ans.stride = 1; + return ans; +} + +NCCL_API(ncclTeam_t, ncclTeamLsa, ncclComm_t comm); +ncclTeam_t ncclTeamLsa(ncclComm_t comm) { + // Ignoring errors since if it fails ncclDevrInitOnce will try again. + // The returned team will be junk and the next "interesting" API call that + // needs ncclDevrInitOnce will report the error. + if (ncclSuccess != ncclDevrInitOnce(comm)) return ncclTeam_t{}; + + ncclTeam_t ans; + ans.nRanks = comm->devrState.lsaSize; + ans.rank = comm->devrState.lsaSelf; + ans.stride = 1; + return ans; +} + +NCCL_API(ncclTeam_t, ncclTeamRail, ncclComm_t comm); +ncclTeam_t ncclTeamRail(ncclComm_t comm) { + // Ignoring errors as above. + if (ncclSuccess != ncclDevrInitOnce(comm)) return ncclTeam_t{}; + + ncclTeam_t ans; + ans.nRanks = comm->nRanks/comm->devrState.lsaSize; + ans.rank = comm->rank/comm->devrState.lsaSize; + ans.stride = comm->devrState.lsaSize; + return ans; +} + +NCCL_API(int, ncclTeamRankToWorld, ncclComm_t comm, ncclTeam_t team, int rank); +int ncclTeamRankToWorld(ncclComm_t comm, ncclTeam_t team, int rank) { + return comm->rank + (rank - team.rank)*team.stride; +} + +NCCL_API(int, ncclTeamRankToLsa, ncclComm_t comm, ncclTeam_t team, int rank); +int ncclTeamRankToLsa(ncclComm_t comm, ncclTeam_t team, int rank) { + // Ignoring errors as above. + if (ncclSuccess != ncclDevrInitOnce(comm)) return -1; + + return comm->devrState.lsaSelf + (rank - team.rank)*team.stride; +} diff --git a/src/nccl_device/ll_a2a.cc b/src/nccl_device/ll_a2a.cc new file mode 100644 index 000000000..6a51d0f2b --- /dev/null +++ b/src/nccl_device/ll_a2a.cc @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
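Below is a minimal usage sketch for the AlltoAll/Gather/Scatter entry points declared above, assuming an initialized communicator, a CUDA stream, and device buffers sized count elements per peer (nRanks*count in total where a full vector is exchanged); error handling is reduced to returning the first failure:

#include <cuda_runtime.h>
#include "nccl.h"

ncclResult_t runNewCollectives(ncclComm_t comm, cudaStream_t stream,
                               const float* send, float* recv,
                               size_t count, int root) {
  ncclResult_t res;
  // All-to-all: slot j of send goes to rank j; slot i of recv comes from rank i.
  res = ncclAlltoAll(send, recv, count, ncclFloat, comm, stream);
  if (res != ncclSuccess) return res;
  // Gather: every rank contributes count elements; only root's recv is written.
  res = ncclGather(send, recv, count, ncclFloat, root, comm, stream);
  if (res != ncclSuccess) return res;
  // Scatter: root's send holds nRanks*count elements; each rank receives count.
  res = ncclScatter(send, recv, count, ncclFloat, root, comm, stream);
  return res;
}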
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_device/impl/ll_a2a__funcs.h" + +NCCL_API(int, ncclLLA2ACalcSlots, int maxElts, int maxEltSize); +int ncclLLA2ACalcSlots(int maxElts, int maxEltSize) { + return maxElts*divUp(maxEltSize, 8); +} + +NCCL_API(ncclResult_t, ncclLLA2ACreateRequirement, int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); +ncclResult_t ncclLLA2ACreateRequirement( + int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, + ncclDevResourceRequirements_t* outReq + ) { + outHandle->nSlots = nSlots; + memset(outReq, 0, sizeof(*outReq)); + outReq->bufferSize = nBlocks*(1 + 2*nSlots)*16; + outReq->bufferAlign = 16; + outReq->outBufferHandle = &outHandle->bufHandle; + return ncclSuccess; +} diff --git a/src/nccl_device/mem_barrier.cc b/src/nccl_device/mem_barrier.cc new file mode 100644 index 000000000..b6c400fa4 --- /dev/null +++ b/src/nccl_device/mem_barrier.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_device/impl/mem_barrier__funcs.h" + +NCCL_API(ncclResult_t, ncclLsaBarrierCreateRequirement, ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); +ncclResult_t ncclLsaBarrierCreateRequirement( + ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, + ncclDevResourceRequirements_t* outReq + ) { + memset(outReq, 0, sizeof(*outReq)); + outHandle->nBarriers = nBarriers; + outReq->bufferSize = (3*nBarriers + nBarriers*team.nRanks)*sizeof(uint32_t); + outReq->bufferAlign = alignof(uint32_t); + outReq->outBufferHandle = &outHandle->bufHandle; + return ncclSuccess; +} diff --git a/src/plugin/CMakeLists.txt b/src/plugin/CMakeLists.txt new file mode 100644 index 000000000..2ef9282f6 --- /dev/null +++ b/src/plugin/CMakeLists.txt @@ -0,0 +1,18 @@ +# Add plugin subdirectories +add_subdirectory(profiler) +add_subdirectory(net) +add_subdirectory(tuner) + +# Plugin sources +set(PLUGIN_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler.cc + ${CMAKE_CURRENT_SOURCE_DIR}/plugin_open.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner.cc + ${PLUGIN_NET_SOURCES} + ${PLUGIN_PROFILER_SOURCES} + ${PLUGIN_TUNER_SOURCES} +) + +# Add plugin sources to parent scope +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/net.cc b/src/plugin/net.cc index aa80c12ab..6abd0804d 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -12,6 +12,7 @@ #include #include +#include //#include //#include //#include @@ -24,17 +25,19 @@ extern getNcclNet_t getNcclNet_v7; extern getNcclNet_t getNcclNet_v8; extern getNcclNet_t getNcclNet_v9; extern getNcclNet_t getNcclNet_v10; +extern getNcclNet_t getNcclNet_v11; extern getNcclCollNet_t getNcclCollNet_v6; extern getNcclCollNet_t getNcclCollNet_v7; extern getNcclCollNet_t getNcclCollNet_v8; extern getNcclCollNet_t getNcclCollNet_v9; extern getNcclCollNet_t getNcclCollNet_v10; +extern getNcclCollNet_t getNcclCollNet_v11; -NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); -#define NCCL_NET_VERSION_COUNT 5 -int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6}; -getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = 
{getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; -getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; +NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 0); +#define NCCL_NET_VERSION_COUNT 6 +int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {11, 10, 9, 8, 7, 6}; +getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v11, getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; +getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v11, getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; #define NCCL_NET_NUM_INTERNAL_PLUGINS 2 @@ -56,19 +59,27 @@ typedef struct netPluginLib { ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin int ncclNetPluginRefCount; // Reference count for the nccl net plugin + int netPhysDevs; // ncclNet - number of physical devices + int netVirtDevs; // ncclNet - number of virtual devices + int collNetPhysDevs; // ncclCollNet - number of physical devices + int collNetVirtDevs; // ncclCollNet - number of virtual devices } netPluginLib_t; int pluginCount = 0; bool netPluginLibsInitialized = false; netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 }; -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; +static std::mutex netPluginMutex; +static std::once_flag initPluginLibsOnceFlag; static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); + // memset will reset the status to ncllNetPluginStateLoadReady memset(pluginLib, 0, sizeof(netPluginLib_t)); + // reset the count of devices to UNDEF_DEV_COUNT + pluginLib->netPhysDevs = pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; + pluginLib->collNetPhysDevs = pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; } return ncclSuccess; } @@ -85,11 +96,15 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { } // if we fail to find a net, exit - if (pluginLib->ncclNet == nullptr) goto fail; + if (pluginLib->ncclNet == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "External network plugin %s is unsupported", + (ncclPluginLibPaths[ncclPluginTypeNet] ? ncclPluginLibPaths[ncclPluginTypeNet] : pluginLib->name)); + goto fail; + } pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady; - // load ncclColNet + // load ncclCollNet for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle); if (pluginLib->ncclCollNet) break; @@ -100,7 +115,8 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { else pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady; - INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name); + INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external network plugin %s", + (ncclPluginLibPaths[ncclPluginTypeNet] ? 
ncclPluginLibPaths[ncclPluginTypeNet] : pluginLib->name)); exit: return ncclSuccess; fail: @@ -137,25 +153,35 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclSuccess; } -static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) { +static ncclResult_t ncclNetPluginInit(struct ncclComm* comm, netPluginLib_t* pluginLib) { int ndev; - if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclNet) { - if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; + if (pluginLib->ncclNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclNet) { + ncclNetCommConfig_t commConfig = {}; + commConfig.trafficClass = comm->config.trafficClass == NCCL_CONFIG_UNDEF_INT ? NCCL_NET_TRAFFIC_CLASS_UNDEF : comm->config.trafficClass; + if (pluginLib->ncclNet->init(&comm->netContext, comm->commHash, &commConfig, ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail; + pluginLib->netPhysDevs = ndev; + pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; } pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled; INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name); - if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { - if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + if (pluginLib->ncclCollNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { + if (pluginLib->ncclCollNet->init(&comm->collNetContext, comm->commHash, ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; else { + pluginLib->collNetPhysDevs = ndev; + pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled; } } exit: return ncclSuccess; fail: + INFO(NCCL_INIT|NCCL_NET, "Failed to initialize NET plugin %s", pluginLib->ncclNet->name); + pluginLib->ncclNet->finalize(comm->netContext); + pluginLib->netPhysDevs = pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; + pluginLib->collNetPhysDevs = pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled; pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; goto exit; @@ -214,6 +240,9 @@ static void initPluginLibsOnceFunc() { memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t)); envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN"); if (envNetPlugin) { + INFO(NCCL_ENV|NCCL_NET, "NCCL_NET_PLUGIN set by environment to %s", envNetPlugin); + if (strcasecmp(envNetPlugin, "none") == 0) + envNetPlugin = ""; envNetPluginList = strdup(envNetPlugin); // Iterate over list until the list is empty netPluginName = strtok_r(envNetPluginList, ",", &savePtr); @@ -221,7 +250,7 @@ static void initPluginLibsOnceFunc() { // We have 2 internal plugins (ib and socket) // So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) { - INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); + INFO(NCCL_NET|NCCL_ENV,"NCCL_NET_PLUGIN list 
contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); break; } // need to leave space for the name + "\n" @@ -231,7 +260,7 @@ static void initPluginLibsOnceFunc() { strcpy(netPluginLibs[pluginCounter].name, netPluginName); pluginCounter++; } else { - INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); + INFO(NCCL_NET|NCCL_ENV,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); } netPluginName = strtok_r(nullptr, ",", &savePtr); } @@ -253,14 +282,14 @@ static void initPluginLibsOnceFunc() { ncclResult_t ncclNetInit(struct ncclComm* comm) { bool ncclNetPluginInitialized = false; - pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc); - pthread_mutex_lock(&netPluginLock); + std::call_once(initPluginLibsOnceFlag, initPluginLibsOnceFunc); + std::lock_guard lock(netPluginMutex); for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) { if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) { NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex])); } - if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) { - NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex])); + if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateInitReady) { + NCCLCHECK(ncclNetPluginInit(comm, &netPluginLibs[pluginIndex])); } if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) { bool isAssigned = false; @@ -273,7 +302,6 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { } } } - pthread_mutex_unlock(&netPluginLock); if (ncclNetPluginInitialized) return ncclSuccess; WARN("Failed to initialize any NET plugin"); return ncclInvalidUsage; @@ -281,15 +309,60 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { ncclResult_t ncclNetFinalize(struct ncclComm* comm) { int pluginIndex = comm->netPluginIndex; - pthread_mutex_lock(&netPluginLock); + std::lock_guard lock(netPluginMutex); + NCCLCHECK(comm->ncclNet->finalize(comm->netContext)); + if (comm->collNetContext) NCCLCHECK(comm->ncclCollNet->finalize(comm->collNetContext)); netPluginLibs[pluginIndex].ncclNetPluginRefCount--; for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i])); } - pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } +ncclResult_t ncclNetGetDevCount(int netPluginIndex, int* nPhysDevs, int* nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclNetPluginState != ncclNetPluginStateEnabled || + netPluginLibs[netPluginIndex].netPhysDevs == NCCL_UNDEF_DEV_COUNT) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + *nPhysDevs = netPluginLibs[netPluginIndex].netPhysDevs; + *nVirtDevs = netPluginLibs[netPluginIndex].netVirtDevs; + return ncclSuccess; +fail: + WARN("%s: trying to access the number of devices of an uninitialized netPlugin[%d]", __func__, netPluginIndex); + return ncclInternalError; +} + +ncclResult_t ncclCollNetGetDevCount(int netPluginIndex, int* nPhysDevs, int* nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclCollNetPluginState != ncclNetPluginStateEnabled || + netPluginLibs[netPluginIndex].collNetPhysDevs == NCCL_UNDEF_DEV_COUNT) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + *nPhysDevs 
= netPluginLibs[netPluginIndex].collNetPhysDevs; + *nVirtDevs = netPluginLibs[netPluginIndex].collNetVirtDevs; + return ncclSuccess; +fail: + WARN("%s: trying to access the number of devices of an uninitialized netPlugin[%d]", __func__, netPluginIndex); + return ncclInternalError; +} + +ncclResult_t ncclNetSetVirtDevCount(int netPluginIndex, int nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclNetPluginState != ncclNetPluginStateEnabled || nVirtDevs < 0) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + netPluginLibs[netPluginIndex].netVirtDevs = nVirtDevs; + return ncclSuccess; +fail: + WARN("%s: failed to set the number of devices for netPlugin[%d] to %d", __func__, netPluginIndex,nVirtDevs); + return ncclInternalError; +} + +ncclResult_t ncclCollNetSetVirtDevCount(int netPluginIndex, int nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclCollNetPluginState != ncclNetPluginStateEnabled || nVirtDevs < 0) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + netPluginLibs[netPluginIndex].collNetVirtDevs = nVirtDevs; + return ncclSuccess; +fail: + WARN("%s: failed to set the number of devices for netPlugin[%d] to %d", __func__, netPluginIndex,nVirtDevs); + return ncclInternalError; +} + ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 @@ -324,7 +397,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); + NCCLCHECKGOTO(comm->ncclNet->listen(comm->netContext, dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; @@ -336,7 +409,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { } if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->connect(comm->netContext, dev, &handle, &sComm, NULL), ret, cleanup2); if (rComm == NULL) NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); diff --git a/src/plugin/net/CMakeLists.txt b/src/plugin/net/CMakeLists.txt new file mode 100644 index 000000000..0a6fcb237 --- /dev/null +++ b/src/plugin/net/CMakeLists.txt @@ -0,0 +1,12 @@ +# Net plugin sources +set(PLUGIN_NET_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/net_v9.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v6.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v7.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v8.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v10.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v11.cc +) + +# Add net plugin sources to parent scope +set(PLUGIN_NET_SOURCES ${PLUGIN_NET_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc index 682f239f7..591a57ac0 100644 --- a/src/plugin/net/net_v10.cc +++ b/src/plugin/net/net_v10.cc @@ -7,26 +7,203 @@ #include "nccl_net.h" #include "net_device.h" #include "proxy.h" +#include "checks.h" +#include +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; static ncclNet_v10_t* ncclNet_v10; static ncclCollNet_v10_t* ncclCollNet_v10; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t props_v10; + NCCLCHECK(ncclNet_v10->getProperties(dev, &props_v10)); + props->name = props_v10.name; + props->pciPath = 
props_v10.pciPath; + props->guid = props_v10.guid; + props->ptrSupport = props_v10.ptrSupport; + props->regIsGlobal = props_v10.regIsGlobal; + props->forceFlush = props_v10.forceFlush; + props->speed = props_v10.speed; + props->port = props_v10.port; + props->latency = props_v10.latency; + props->maxComms = props_v10.maxComms; + props->maxRecvs = props_v10.maxRecvs; + props->netDeviceType = props_v10.netDeviceType; + props->netDeviceVersion = props_v10.netDeviceVersion; + props->vProps.ndevs = props_v10.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v10.vProps.devs[i]; + } + props->maxP2pBytes = props_v10.maxP2pBytes; + props->maxCollBytes = props_v10.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v10->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v10->connect(dev, (ncclNetCommConfig_v10_t *)ctx, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_v11_t* props) { + return ncclNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t *)props); +} + +static ncclResult_t ncclNet_finalize(void* ctx) { + refCount[NET_INDEX]--; + free(ctx); + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx, uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config, ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // since ncclNet_v11, the ncclNetCommConfig_t has been moved from connect to init. Since the config is per comm, + // this allows the config to be passed only once, instead of multiple times (once per connect). To preserve the + // ncclNet_v10 behavior, in the compat layer, we store the config in the context pointer and pass it to the connect + // function. + ncclNetCommConfig_v10_t* config_v10 = nullptr; + NCCLCHECK(ncclCalloc(&config_v10, 1)); + config_v10->trafficClass = config->trafficClass; + *ctx = config_v10; + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v10 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; + NCCLCHECK(ncclNet_v10->init(logfn, proffn)); + ncclNet.devices = ncclNet_v10->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v10->accept; + ncclNet.regMr = ncclNet_v10->regMr; + ncclNet.regMrDmaBuf = ncclNet_v10->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v10->deregMr; + ncclNet.isend = ncclNet_v10->isend; + ncclNet.irecv = ncclNet_v10->irecv; + ncclNet.iflush = ncclNet_v10->iflush; + ncclNet.test = ncclNet_v10->test; + ncclNet.closeSend = ncclNet_v10->closeSend; + ncclNet.closeRecv = ncclNet_v10->closeRecv; + ncclNet.closeListen = ncclNet_v10->closeListen; + ncclNet.getDeviceMr = ncclNet_v10->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v10->irecvConsumed; + ncclNet.makeVDevice = (ncclNet_v10->makeVDevice) ? 
ncclNet_makeVDevice : nullptr; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; + return ncclSuccess; +} + ncclNet_t* getNcclNet_v10(void* lib) { ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); if (ncclNet_v10) { + ncclNet.name = ncclNet_v10->name; + ncclNet.init = ncclNet_init; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); - return ncclNet_v10; + return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); return nullptr; } +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t props_v10; + NCCLCHECK(ncclCollNet_v10->getProperties(dev, &props_v10)); + props->name = props_v10.name; + props->pciPath = props_v10.pciPath; + props->guid = props_v10.guid; + props->ptrSupport = props_v10.ptrSupport; + props->regIsGlobal = props_v10.regIsGlobal; + props->forceFlush = props_v10.forceFlush; + props->speed = props_v10.speed; + props->port = props_v10.port; + props->latency = props_v10.latency; + props->maxComms = props_v10.maxComms; + props->maxRecvs = props_v10.maxRecvs; + props->netDeviceType = props_v10.netDeviceType; + props->netDeviceVersion = props_v10.netDeviceVersion; + props->vProps.ndevs = props_v10.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v10.vProps.devs[i]; + } + props->maxP2pBytes = props_v10.maxP2pBytes; + props->maxCollBytes = props_v10.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle , void** listenComm) { + return ncclCollNet_v10->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + return ncclCollNet_v10->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v10_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + return ncclCollNet_v10->ireducescatter(collComm, nSendParts, (ncclNetSGE_v10_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclCollNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclCollNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t *)props); +} + +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v10 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
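The v10 shim above bridges two plugin generations: the per-communicator config now handed to init is stashed in the opaque context and replayed on each connect, while a reference count keeps the underlying v10 plugin from being initialized or torn down more than once. A generic sketch of that bridging pattern (names are illustrative, not NCCL's):

#include <cstdlib>

// Illustrative compat-shim skeleton: new-style init(ctx, config) wrapping an
// old-style plugin that supports only one global init and takes its config at
// connect time instead.
struct ShimCtx { int trafficClass; };
static int gNetRefCount = 0;   // callers serialize init/finalize with a lock

static int shimInit(void** ctx, int trafficClass) {
  ShimCtx* c = (ShimCtx*)calloc(1, sizeof(ShimCtx));
  if (c == nullptr) return -1;
  c->trafficClass = trafficClass;     // kept for later connect() calls
  *ctx = c;
  if (gNetRefCount++ != 0) return 0;  // legacy plugin already initialized
  // ... call the legacy plugin's one-and-only global init() here ...
  return 0;
}

static int shimConnect(void* ctx, int dev) {
  ShimCtx* c = (ShimCtx*)ctx;
  (void)dev;
  // ... forward to the legacy connect(), passing c->trafficClass along ...
  return c ? 0 : -1;
}

static int shimFinalize(void* ctx) {
  free(ctx);
  --gNetRefCount;   // the legacy API has no global teardown to call
  return 0;
}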
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; + NCCLCHECK(ncclCollNet_v10->init(logfn)); + ncclCollNet.devices = ncclCollNet_v10->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_listen; + ncclCollNet.connect = ncclCollNet_v10->connect; + ncclCollNet.reduceSupport = ncclCollNet_v10->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v10->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v10->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v10->deregMr; + ncclCollNet.iallreduce = ncclCollNet_v10->iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v10->iflush; + ncclCollNet.test = ncclCollNet_v10->test; + ncclCollNet.closeColl = ncclCollNet_v10->closeColl; + ncclCollNet.closeListen = ncclCollNet_v10->closeListen; + ncclCollNet.makeVDevice = ncclCollNet_makeVDevice; + ncclCollNet.finalize = ncclCollNet_finalize; + return ncclSuccess; +} + ncclCollNet_t* getNcclCollNet_v10(void* lib) { ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); if (ncclCollNet_v10) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); - return ncclCollNet_v10; + ncclCollNet.name = ncclCollNet_v10->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name); + return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v11.cc b/src/plugin/net/net_v11.cc new file mode 100644 index 000000000..b13a0efb9 --- /dev/null +++ b/src/plugin/net/net_v11.cc @@ -0,0 +1,31 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include + +static ncclNet_v11_t* ncclNet_v11; +static ncclCollNet_v11_t* ncclCollNet_v11; + +ncclNet_t* getNcclNet_v11(void* lib) { + ncclNet_v11 = (ncclNet_v11_t*)dlsym(lib, "ncclNetPlugin_v11"); + if (ncclNet_v11) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v11)", ncclNet_v11->name); + return ncclNet_v11; + } + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v11(void* lib) { + ncclCollNet_v11 = (ncclCollNet_v11_t*)dlsym(lib, "ncclCollNetPlugin_v11"); + if (ncclCollNet_v11) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v11)", ncclCollNet_v11->name); + return ncclCollNet_v11; + } + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc index baff67935..73eb8614d 100644 --- a/src/plugin/net/net_v6.cc +++ b/src/plugin/net/net_v6.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v6_t* ncclNet_v6; static ncclCollNet_v6_t* ncclCollNet_v6; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); @@ -35,6 +41,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -43,7 +50,14 @@ static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclNet_v6->listen(d, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v6->connect(dev, handle, sendComm); } @@ -51,7 +65,9 @@ static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDev return ncclNet_v6->accept(listenComm, recvComm); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -59,7 +75,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if optional receive completion is set if (*request == (void 
*)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -71,6 +89,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); @@ -92,9 +115,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v6->listen(d, handle, listenComm); +} + static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); @@ -110,11 +139,24 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, + ncclProfilerCallback_t proffn __attribute__((unused))) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v6 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v6->init(logfn)); ncclNet.devices = ncclNet_v6->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v6->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_accept; ncclNet.regMr = ncclNet_regMr; @@ -130,6 +172,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = NULL; ncclNet.irecvConsumed = NULL; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -141,15 +185,20 @@ ncclNet_t* getNcclNet_v6(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v6 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
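Each legacy shim also widens the plugin's property struct to the current layout, copying the fields the old version reports and filling new fields with conservative defaults (a single-entry virtual-device list, maxMultiRequestSize of 1, legacy size limits). A condensed sketch of that translation, using hypothetical struct names in place of the versioned NCCL types:

#include <cstddef>

// Hypothetical stand-ins for an old and a new property layout.
struct OldProps { const char* name; int speed; int port; };
struct NewProps {
  const char* name; int speed; int port;
  int vDevCount; int vDevs[4];      // virtual-device list, absent in old versions
  size_t maxP2pBytes;               // explicit size limit, absent in old versions
  int maxMultiRequestSize;          // batching hint, absent in old versions
};

// Upgrade: copy what the old plugin reports, default everything it predates.
static void upgradeProps(int dev, const OldProps& oldp, NewProps* newp) {
  newp->name = oldp.name;
  newp->speed = oldp.speed;
  newp->port = oldp.port;
  newp->vDevCount = 1;                  // old plugins expose one physical device
  newp->vDevs[0] = dev;
  newp->maxP2pBytes = (size_t)1 << 31;  // legacy int-sized transfer ceiling
  newp->maxMultiRequestSize = 1;        // no multi-request support
}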
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v6->init(logfn)); ncclCollNet.devices = ncclCollNet_v6->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v6->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v6->connect; ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport; ncclCollNet.regMr = ncclCollNet_regMr; @@ -162,6 +211,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v6->test; ncclCollNet.closeColl = ncclCollNet_v6->closeColl; ncclCollNet.closeListen = ncclCollNet_v6->closeListen; + ncclCollNet.makeVDevice = NULL; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -173,6 +224,5 @@ ncclCollNet_t* getNcclCollNet_v6(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc index 4bad5ec26..a13717294 100644 --- a/src/plugin/net/net_v7.cc +++ b/src/plugin/net/net_v7.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v7_t* ncclNet_v7; static ncclCollNet_v7_t* ncclCollNet_v7; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); @@ -35,10 +41,18 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v7->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm); } @@ -47,7 +61,9 @@ static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -55,7 +71,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if 
optional receive completion is set if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -67,6 +85,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); @@ -88,9 +111,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v7->listen(d, handle, listenComm); +} + static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); @@ -106,11 +135,24 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, + ncclProfilerCallback_t proffn __attribute__((unused))) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v7 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v7->init(logfn)); ncclNet.devices = ncclNet_v7->devices; ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; - ncclNet.listen = ncclNet_v7->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v7->accept; ncclNet.regMr = ncclNet_regMr; @@ -126,6 +168,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -137,15 +181,20 @@ ncclNet_t* getNcclNet_v7(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v7 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
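A note on the wrapper functions above (ncclNet_listen, ncclNet_connect, ncclNet_isend, ...): the newer API threads a context pointer and profiler handles through every call, which the v6/v7 plugins do not understand, so the shims drop those arguments and forward the rest. A standalone sketch of that adapter pattern, with made-up struct and function names:

// Hypothetical legacy vtable; only the adapter shape mirrors the patch.
struct LegacyNet {
  int (*listen)(int dev, void* handle, void** listenComm);
};

static LegacyNet* legacy;   // assumed to be filled in after dlsym()

// New-style entry point: the extra ctx parameter is ignored for legacy plugins.
static int net_listen(void* /*ctx*/, int dev, void* handle, void** listenComm) {
  return legacy->listen(dev, handle, listenComm);
}

// Tiny demo stub so the sketch is runnable on its own.
static int fakeListen(int, void*, void** comm) { *comm = nullptr; return 0; }

int main() {
  static LegacyNet impl = { fakeListen };
  legacy = &impl;
  void* lc;
  return net_listen(nullptr, 0, nullptr, &lc);
}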
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v7->init(logfn)); ncclCollNet.devices = ncclCollNet_v7->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v7->connect; ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; ncclCollNet.regMr = ncclCollNet_regMr; @@ -158,6 +207,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v7->test; ncclCollNet.closeColl = ncclCollNet_v7->closeColl; ncclCollNet.closeListen = ncclCollNet_v7->closeListen; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -169,6 +219,5 @@ ncclCollNet_t* getNcclCollNet_v7(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc index b43bb895e..d241d5dc5 100644 --- a/src/plugin/net/net_v8.cc +++ b/src/plugin/net/net_v8.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v8_t* ncclNet_v8; static ncclCollNet_v8_t* ncclCollNet_v8; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v8_t p8; ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); @@ -35,14 +41,24 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v8->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -50,7 +66,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if optional receive completion is set if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -62,6 +80,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, 
size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v8_t p8; ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); @@ -83,9 +106,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclCollNet_v8->listen(dev, handle, listenComm); +} + static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { int countInt; @@ -128,11 +157,23 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v8 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v8->init(logfn)); ncclNet.devices = ncclNet_v8->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v8->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v8->accept; ncclNet.regMr = ncclNet_v8->regMr; @@ -148,6 +189,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -159,15 +202,20 @@ ncclNet_t* getNcclNet_v8(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v8 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
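The regMr/isend shims above have to narrow 64-bit sizes to the int the older plugin ABI expects; oversized requests are rejected rather than silently truncated. A standalone sketch of that guard (the exact MAX_NET_SIZE value below is an assumption, not taken from the patch):

#include <cstddef>

// Assumed limit: the largest size that still fits the legacy int parameter.
constexpr std::size_t kMaxNetSize = (1UL << 31) - 1;

// Returns false if the request cannot be expressed on the old ABI;
// the caller maps that to ncclInternalError.
bool narrowSize(std::size_t size, int* sizeInt) {
  if (size > kMaxNetSize) return false;
  *sizeInt = static_cast<int>(size);
  return true;
}

int main() {
  int s;
  bool ok = narrowSize(1 << 20, &s);             // fits: ok == true, s == 1048576
  bool tooBig = narrowSize(kMaxNetSize + 1, &s); // rejected
  return (ok && !tooBig) ? 0 : 1;
}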
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v8->init(logfn)); ncclCollNet.devices = ncclCollNet_v8->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v8->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v8->connect; ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport; ncclCollNet.regMr = ncclCollNet_v8->regMr; @@ -180,6 +228,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v8->test; ncclCollNet.closeColl = ncclCollNet_v8->closeColl; ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + ncclCollNet.makeVDevice = nullptr; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -191,6 +241,5 @@ ncclCollNet_t* getNcclCollNet_v8(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc index 34e039332..12011aa8c 100644 --- a/src/plugin/net/net_v9.cc +++ b/src/plugin/net/net_v9.cc @@ -8,25 +8,64 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v9_t* ncclNet_v9; static ncclCollNet_v9_t* ncclCollNet_v9; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t props_v9; + NCCLCHECK(ncclNet_v9->getProperties(dev, &props_v9)); + props->name = props_v9.name; + props->pciPath = props_v9.pciPath; + props->guid = props_v9.guid; + props->ptrSupport = props_v9.ptrSupport; + props->regIsGlobal = props_v9.regIsGlobal; + props->forceFlush = props_v9.forceFlush; + props->speed = props_v9.speed; + props->port = props_v9.port; + props->latency = props_v9.latency; + props->maxComms = props_v9.maxComms; + props->maxRecvs = props_v9.maxRecvs; + props->netDeviceType = props_v9.netDeviceType; + props->netDeviceVersion = props_v9.netDeviceVersion; + props->vProps.ndevs = props_v9.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v9.vProps.devs[i]; + } + props->maxP2pBytes = props_v9.maxP2pBytes; + props->maxCollBytes = props_v9.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request); } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request); } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t 
ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v9->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm); } @@ -34,8 +73,40 @@ static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t props_v9; + NCCLCHECK(ncclCollNet_v9->getProperties(dev, &props_v9)); + props->name = props_v9.name; + props->pciPath = props_v9.pciPath; + props->guid = props_v9.guid; + props->ptrSupport = props_v9.ptrSupport; + props->regIsGlobal = props_v9.regIsGlobal; + props->forceFlush = props_v9.forceFlush; + props->speed = props_v9.speed; + props->port = props_v9.port; + props->latency = props_v9.latency; + props->maxComms = props_v9.maxComms; + props->maxRecvs = props_v9.maxRecvs; + props->netDeviceType = props_v9.netDeviceType; + props->netDeviceVersion = props_v9.netDeviceVersion; + props->vProps.ndevs = props_v9.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v9.vProps.devs[i]; + } + props->maxP2pBytes = props_v9.maxP2pBytes; + props->maxCollBytes = props_v9.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v9->listen(d, handle, listenComm); } static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, @@ -53,11 +124,27 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n windowOffset, windowBytes, dataType, redOp, recvMhandle, request); } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclCollNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t *)props); +} + +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v9 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
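The v9 getProperties shims above copy the legacy structure field by field and give the one field old plugins cannot report, maxMultiRequestSize, a conservative default of 1. A reduced sketch with hypothetical struct layouts (the real ncclNetProperties structs carry many more fields):

#include <cstdint>

struct PropsV9 {                       // illustrative subset of the v9 struct
  int speed;
  int port;
  uint64_t maxP2pBytes;
};

struct Props {                         // illustrative subset of the current struct
  int speed;
  int port;
  uint64_t maxP2pBytes;
  int maxMultiRequestSize;             // new field, absent from v9
};

static void convertProps(const PropsV9& in, Props* out) {
  out->speed = in.speed;
  out->port = in.port;
  out->maxP2pBytes = in.maxP2pBytes;
  out->maxMultiRequestSize = 1;        // old plugins post one request at a time
}

int main() {
  PropsV9 p9{100000, 1, 1ULL << 30};
  Props p{};
  convertProps(p9, &p);
  return p.maxMultiRequestSize == 1 ? 0 : 1;
}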
+ if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v9->init(logfn)); ncclNet.devices = ncclNet_v9->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v9->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v9->accept; ncclNet.regMr = ncclNet_v9->regMr; @@ -73,6 +160,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -84,15 +173,20 @@ ncclNet_t* getNcclNet_v9(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v9 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v9->init(logfn)); ncclCollNet.devices = ncclCollNet_v9->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v9->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v9->connect; ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport; ncclCollNet.regMr = ncclCollNet_v9->regMr; @@ -105,6 +199,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v9->test; ncclCollNet.closeColl = ncclCollNet_v9->closeColl; ncclCollNet.closeListen = ncclCollNet_v9->closeListen; + ncclCollNet.makeVDevice = (ncclCollNet_v9->makeVDevice) ? 
ncclCollNet_makeVDevice : nullptr; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -116,6 +212,5 @@ ncclCollNet_t* getNcclCollNet_v9(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); return nullptr; } diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index f80321c81..740f22065 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "debug.h" @@ -16,6 +17,7 @@ #define NUM_LIBS 3 static char* libNames[NUM_LIBS]; +char* ncclPluginLibPaths[NUM_LIBS]; static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; @@ -50,6 +52,14 @@ static void appendNameToList(char* nameList, int *leftChars, char* name) { *leftChars -= strlen(name) + 1; } +static char* getLibPath(void* handle) { + struct link_map* lm; + if (dlinfo(handle, RTLD_DI_LINKMAP, &lm) != 0) + return nullptr; + else + return strdup(lm->l_name); +} + static void* openPluginLib(enum ncclPluginType type, const char* libName) { int openErr, len = PATH_MAX; char libName_[MAX_STR_LEN] = { 0 }; @@ -58,49 +68,44 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { if (libName && strlen(libName)) { snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - libNames[type] = strdup(libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - - // libName can't be a relative or absolute path (start with '.' or contain any '/'). It can't be a library name either (start with 'lib' or end with '.so') - if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { - snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - libNames[type] = strdup(libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - } } else { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); + } + + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + libNames[type] = strdup(libName_); + ncclPluginLibPaths[type] = getLibPath(libHandles[type]); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s: %s", pluginNames[type], libName_, openErrStr); + } + + // libName can't be a relative or absolute path (start with '.' or contain any '/'). 
It can't be a library name either (start with 'lib' or end with '.so') + if (libName && strlen(libName) && strchr(libName, '/') == nullptr && + (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || + strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { libNames[type] = strdup(libName_); + ncclPluginLibPaths[type] = getLibPath(libHandles[type]); return libHandles[type]; } if (openErr == ENOENT) { appendNameToList(eNoEntNameList, &len, libName_); } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + INFO(subsys[type], "%s/Plugin: %s: %s", pluginNames[type], libName_, openErrStr); } } if (strlen(eNoEntNameList)) { - INFO(subsys[type], "%s/Plugin: Could not find:%s. %s", pluginNames[type], eNoEntNameList, pluginFallback[type]); + INFO(subsys[type], "%s/Plugin: Could not find:%s%s%s", pluginNames[type], eNoEntNameList, + (strlen(pluginFallback[type]) > 0 ? ". " : ""), pluginFallback[type]); } else if (strlen(pluginFallback[type])) { INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]); } @@ -123,6 +128,7 @@ void* ncclGetNetPluginLib(enum ncclPluginType type) { if (libNames[ncclPluginTypeNet]) { // increment the reference counter of the net library libNames[type] = strdup(libNames[ncclPluginTypeNet]); + ncclPluginLibPaths[type] = strdup(ncclPluginLibPaths[ncclPluginTypeNet]); libHandles[type] = dlopen(libNames[ncclPluginTypeNet], RTLD_NOW | RTLD_LOCAL); } return libHandles[type]; @@ -132,6 +138,8 @@ ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type) { if (handle && libHandles[type] == handle) { dlclose(handle); libHandles[type] = nullptr; + free(ncclPluginLibPaths[type]); + ncclPluginLibPaths[type] = nullptr; free(libNames[type]); libNames[type] = nullptr; } diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc index 15c3f2bc2..8514db17c 100644 --- a/src/plugin/profiler.cc +++ b/src/plugin/profiler.cc @@ -13,17 +13,22 @@ #include "profiler.h" #include "transport.h" #include "plugin.h" +#include extern ncclProfiler_t* getNcclProfiler_v1(void* lib); extern ncclProfiler_t* getNcclProfiler_v2(void* lib); extern ncclProfiler_t* getNcclProfiler_v3(void* lib); extern ncclProfiler_t* getNcclProfiler_v4(void* lib); +extern ncclProfiler_t* getNcclProfiler_v5(void* lib); -static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex profilerMutex; static int profilerPluginRefCount; static void* profilerPluginLib; static ncclProfiler_t* ncclProfiler; +extern __thread int ncclGroupDepth; +__thread ncclProfilerApiState_t ncclProfilerApiState; + #define MAX_STR_LEN 256 enum { @@ -35,22 +40,37 @@ static int profilerPluginStatus = profilerPluginLoadReady; static pid_t pid; static ncclResult_t ncclProfilerPluginLoad(void) { + const char* profilerName; if (profilerPluginLoadFailed == profilerPluginStatus) { return ncclSuccess; } - pthread_mutex_lock(&profilerLock); + std::lock_guard lock(profilerMutex); if (profilerPluginLoadSuccess == profilerPluginStatus) { ++profilerPluginRefCount; goto exit; } - profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN")); + if ((profilerName = ncclGetEnv("NCCL_PROFILER_PLUGIN")) != nullptr) { + INFO(NCCL_ENV, "NCCL_PROFILER_PLUGIN set by environment to %s", profilerName); + if (strcasecmp(profilerName, "none") == 0) + goto fail; + } + 
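For context on getLibPath() in plugin_open.cc above: dlinfo(handle, RTLD_DI_LINKMAP, ...) exposes the link_map of an already-opened library, whose l_name member holds the path the dynamic loader actually resolved; that is what gets stored in ncclPluginLibPaths and later reported in logs. A standalone sketch (glibc-specific; g++ defines _GNU_SOURCE by default, and older systems may need -ldl):

#include <dlfcn.h>
#include <link.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static char* loadedPath(void* handle) {
  struct link_map* lm = nullptr;
  if (dlinfo(handle, RTLD_DI_LINKMAP, &lm) != 0) return nullptr;
  return strdup(lm->l_name);            // caller frees, as in the patch
}

int main() {
  void* h = dlopen("libm.so.6", RTLD_NOW | RTLD_LOCAL);  // any shared library works
  if (!h) return 1;
  char* path = loadedPath(h);
  if (path) { std::printf("resolved to %s\n", path); std::free(path); }
  dlclose(h);
  return 0;
}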
profilerPluginLib = ncclOpenProfilerPluginLib(profilerName); if (profilerPluginLib == nullptr) { - goto fail; + profilerPluginLib = ncclGetNetPluginLib(ncclPluginTypeProfiler); + if (nullptr == profilerPluginLib) { + goto fail; + } + profilerName = nullptr; + } else if (ncclPluginLibPaths[ncclPluginTypeProfiler]) { + profilerName = ncclPluginLibPaths[ncclPluginTypeProfiler]; } - ncclProfiler = getNcclProfiler_v4(profilerPluginLib); + ncclProfiler = getNcclProfiler_v5(profilerPluginLib); + if (ncclProfiler == nullptr) { + ncclProfiler = getNcclProfiler_v4(profilerPluginLib); + } if (ncclProfiler == nullptr) { ncclProfiler = getNcclProfiler_v3(profilerPluginLib); } @@ -61,8 +81,10 @@ static ncclResult_t ncclProfilerPluginLoad(void) { ncclProfiler = getNcclProfiler_v1(profilerPluginLib); } if (ncclProfiler == NULL) { + if (profilerName) INFO(NCCL_INIT, "External profiler plugin %s is unsupported", profilerName); goto fail; } + if (profilerName) INFO(NCCL_INIT, "Successfully loaded external profiler plugin %s", profilerName); ++profilerPluginRefCount; profilerPluginStatus = profilerPluginLoadSuccess; @@ -74,7 +96,6 @@ static ncclResult_t ncclProfilerPluginLoad(void) { pid = getpid(); exit: - pthread_mutex_unlock(&profilerLock); return ncclSuccess; fail: if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); @@ -84,15 +105,16 @@ static ncclResult_t ncclProfilerPluginLoad(void) { } static ncclResult_t ncclProfilerPluginUnload(void) { - pthread_mutex_lock(&profilerLock); + std::lock_guard lock(profilerMutex); if (0 == (--profilerPluginRefCount)) { - INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + INFO(NCCL_INIT, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); + } NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; } - pthread_mutex_unlock(&profilerLock); return ncclSuccess; } @@ -167,10 +189,9 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { TIME_START_EVENT(init); ncclProfilerPluginLoad(); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); + int err = ncclProfiler->init(&comm->profilerContext, comm->commHash, &ncclProfilerEventMask, comm->config.commName, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); if (err) { - WARN("Profiler init failed with error (%d). Continue without profiler.", err); - ncclProfiler = NULL; + INFO(NCCL_INIT, "Profiler init failed with error '%d': %s. 
Continue without profiler.", err, strerror(errno)); } } TIME_STOP_EVENT(init); @@ -179,7 +200,7 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { TIME_START_EVENT(finalize); - if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (__builtin_expect(ncclProfiler != NULL, 0) && comm->profilerContext) { ncclProfiler->finalize(comm->profilerContext); } ncclProfilerPluginUnload(); @@ -189,6 +210,143 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t ncclProfilerStartGroupApiEvent(struct ncclInfo* info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileGroupApi; + eDescr.groupApi.graphCaptured = isGraphCaptured; + + ncclProfilerApiState.eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); + int groupApiMask = ncclProfileGroupApi | ncclProfileP2pApi | ncclProfileCollApi | ncclProfileKernelLaunch | ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + // Only count outermost groups when emitting group API events + if (__builtin_expect(ncclProfiler != NULL, 0) && (ncclProfilerApiState.eActivationMask & groupApiMask)) { + if (ncclProfilerApiState.profilerGroupDepth == 0) { + eDescr.groupApi.groupDepth = ncclGroupDepth; + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.groupApiEventHandle, &eDescr); + ncclProfilerApiState.profilerGroupDepth = ncclGroupDepth; + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateStarted; + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopGroupApiEvent() { + void* groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + if (__builtin_expect(ncclProfiler != NULL, 0) && groupApiEventHandle && ncclProfilerApiState.profilerGroupDepth == 0) { + ncclProfiler->stopEvent(groupApiEventHandle); + ncclProfilerApiState.groupApiEventHandle = nullptr; + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerRecordGroupApiEventState(ncclProfilerEventState_t eState) { + void* groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + bool shouldRecord = false; + if (eState == ncclProfilerGroupStartApiStop && ncclProfilerApiState.state == ncclProfilerGroupApiStartStateStarted) { + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateStopped; + shouldRecord = true; + } else if (eState == ncclProfilerGroupEndApiStart && ncclProfilerApiState.state == ncclProfilerGroupApiStartStateStopped) { + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateReset; + shouldRecord = true; + } + + if (__builtin_expect(ncclProfiler != NULL, 0) && groupApiEventHandle && shouldRecord) { + ncclProfiler->recordEventState(groupApiEventHandle, eState, NULL); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartP2pApiEvent(struct ncclInfo *info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2pApi; + eDescr.parentObj = ncclProfilerApiState.groupApiEventHandle; + eDescr.p2pApi.func = ncclFuncToString(info->coll); + eDescr.p2pApi.count = info->count; + eDescr.p2pApi.datatype = ncclDatatypeToString(info->datatype); + eDescr.p2pApi.stream = (void *) info->stream; + eDescr.p2pApi.graphCaptured = isGraphCaptured; + int p2pApiMask = ncclProfileP2pApi | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + if (__builtin_expect(ncclProfiler != NULL, 0) 
&& (ncclProfilerApiState.eActivationMask & p2pApiMask)) { + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.p2pApiEventHandle, &eDescr); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopP2pApiEvent() { + if (__builtin_expect(ncclProfiler != NULL, 0) && ncclProfilerApiState.p2pApiEventHandle) { + ncclProfiler->stopEvent(ncclProfilerApiState.p2pApiEventHandle); + ncclProfilerApiState.p2pApiEventHandle = nullptr; + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartCollApiEvent(struct ncclInfo *info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileCollApi; + eDescr.parentObj = ncclProfilerApiState.groupApiEventHandle; + eDescr.collApi.func = ncclFuncToString(info->coll); + eDescr.collApi.count = info->count; + eDescr.collApi.datatype = ncclDatatypeToString(info->datatype); + eDescr.collApi.stream = (void *) info->stream; + eDescr.collApi.root = info->root; + eDescr.collApi.graphCaptured = isGraphCaptured; + int collApiMask = ncclProfileCollApi | ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + if (__builtin_expect(ncclProfiler != NULL, 0) && (ncclProfilerApiState.eActivationMask & collApiMask)) { + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.collApiEventHandle, &eDescr); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopCollApiEvent() { + if (__builtin_expect(ncclProfiler != NULL, 0) && ncclProfilerApiState.collApiEventHandle) { + ncclProfiler->stopEvent(ncclProfilerApiState.collApiEventHandle); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartKernelLaunchEvent(struct ncclKernelPlan* plan, cudaStream_t stream) { + ncclProfilerEventDescr_t eDescr = { 0 }; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + void* groupApiEventHandle = NULL; + // Check if any collective in the plan has a set event activation mask + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + int eActivationMask_ = 0; + while (ct) { + if (ct->eActivationMask) { + eActivationMask_ = ct->eActivationMask; + groupApiEventHandle = ct->groupApiEventHandle; + goto startKernelLaunchEvent; + } + ct = ct->next; + } + // Check if any pt2pt in the plan has a set event activation mask + while (pt) { + if (pt->eActivationMask) { + eActivationMask_ = pt->eActivationMask; + groupApiEventHandle = pt->groupApiEventHandle; + goto startKernelLaunchEvent; + } + pt = pt->next; + } + + startKernelLaunchEvent: + if (eActivationMask_ & ncclProfileKernelLaunch) { + eDescr.type = ncclProfileKernelLaunch; + eDescr.parentObj = groupApiEventHandle; + eDescr.kernelLaunch.stream = (void *) stream; + ncclProfiler->startEvent(plan->comm->profilerContext, &plan->kernelLaunchEventHandle, &eDescr); + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopKernelLaunchEvent(struct ncclKernelPlan* plan) { + if (__builtin_expect(ncclProfiler != NULL, 0) && plan->kernelLaunchEventHandle) { + ncclProfiler->stopEvent(plan->kernelLaunchEventHandle); + } + return ncclSuccess; +} + ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { TIME_START_EVENT(groupStart); if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -237,26 +395,25 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct) { if (__builtin_expect(ncclProfiler != NULL, 0)) { - if 
(plan->groupEventHandle) { - int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); - if (enable) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileColl; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; - eDescr.coll.func = ncclFuncToString(ct->func); - eDescr.coll.sendBuff = ct->sendbuff; - eDescr.coll.recvBuff = ct->recvbuff; - eDescr.coll.count = ct->count; - eDescr.coll.root = ct->root; - eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.nChannels = ct->nChannels; - eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ncclAlgoToString(ct->algorithm); - eDescr.coll.proto = ncclProtoToString(ct->protocol); - ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); - } + int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.coll.parentGroup = plan->groupEventHandle; + eDescr.parentObj = ct->collApiEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; + eDescr.coll.func = ncclFuncToString(ct->func); + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); + eDescr.coll.nChannels = ct->nChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); } } // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well. @@ -265,31 +422,30 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which // gives the consistency. 
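On the seqNumber update described in the comment above: the counters are monotonic sequence numbers consumed by RAS, not synchronization points, so the patch bumps them with a relaxed atomic add. An equivalent standalone sketch using the same GCC builtin:

#include <cstdint>

// Hypothetical per-collective counters; the real ones live in ncclComm.
static uint64_t seqNumber[8];

static void bumpSeq(int func) {
  // Relaxed ordering: only the increment itself must be atomic; no other
  // memory accesses need to be ordered around it.
  __atomic_fetch_add(&seqNumber[func], 1, __ATOMIC_RELAXED);
}

int main() {
  bumpSeq(0);
  bumpSeq(0);
  return seqNumber[0] == 2 ? 0 : 1;
}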
- if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && + if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && (plan->groupEventHandle || ct->collApiEventHandle) && (ct->eActivationMask & ncclProfileKernelCh))) __atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED); ct = ct->next; } if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (plan->groupEventHandle) { - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt) { - int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh); - if (enable) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileP2p; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.p2p.func = ncclFuncToString(pt->func); - eDescr.p2p.buff = pt->buff; - eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); - eDescr.p2p.peer = pt->root; - eDescr.p2p.nChannels = pt->nChannels; - ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); - } - pt = pt->next; + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.p2p.parentGroup = plan->groupEventHandle; + eDescr.parentObj = pt->p2pApiEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.func = ncclFuncToString(pt->func); + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); + eDescr.p2p.peer = pt->root; + eDescr.p2p.nChannels = pt->nChannels; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); } + pt = pt->next; } } TIME_STOP_EVENT(taskStart); @@ -299,17 +455,15 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStop); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (plan->groupEventHandle) { - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct) { - if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); - ct = ct->next; - } - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt) { - if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); - pt = pt->next; - } + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); + ct = ct->next; + } + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); + pt = pt->next; } } TIME_STOP_EVENT(taskStop); @@ -357,18 +511,18 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) { ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; + int step_ = DIVUP(stepId, args->sliceSteps); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { - int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->eActivationMask & (ncclProfileProxyStep | 
ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); - sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } @@ -376,18 +530,18 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; + int step_ = DIVUP(stepId, args->sliceSteps); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { - int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); - sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } @@ -503,11 +657,11 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { return ncclSuccess; } -static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex proxyProfilerConnectMutex; static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) { ncclResult_t ret = ncclSuccess; - pthread_mutex_lock(&proxyProfilerConnectLock); + std::lock_guard lock(proxyProfilerConnectMutex); if (comm->profiler.initialized) goto exit; for (int c = 0; c < MAXCHANNELS; c++) { NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit); @@ -517,7 +671,6 @@ static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxy } comm->profiler.initialized = true; exit: - pthread_mutex_unlock(&proxyProfilerConnectLock); return ret; } diff --git a/src/plugin/profiler/CMakeLists.txt b/src/plugin/profiler/CMakeLists.txt new file mode 100644 index 000000000..1a5cc9a30 --- /dev/null +++ b/src/plugin/profiler/CMakeLists.txt @@ -0,0 +1,11 @@ +# Profiler plugin sources +set(PLUGIN_PROFILER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v3.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v4.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v1.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v2.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v5.cc +) + +# Add profiler plugin sources to parent scope +set(PLUGIN_PROFILER_SOURCES ${PLUGIN_PROFILER_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc index 2126afc68..ef8ef6b5d 100644 --- a/src/plugin/profiler/profiler_v1.cc +++ b/src/plugin/profiler/profiler_v1.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v1_t* ncclProfiler_v1; @@ -63,6 +64,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileColl: { eDescr_v1.coll.name = nullptr; // removed in v4 eDescr_v1.coll.commHash = 
0; // removed in v4 + eDescr_v1.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; @@ -80,6 +82,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v1.p2p.name = nullptr; // removed in v4 eDescr_v1.p2p.commHash = 0; // removed in v4 + eDescr_v1.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); eDescr_v1.p2p.buff = eDescr->p2p.buff; eDescr_v1.p2p.count = eDescr->p2p.count; @@ -125,7 +128,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v1->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; @@ -139,9 +150,8 @@ ncclProfiler_t* getNcclProfiler_v1(void* lib) { if (ncclProfiler_v1) { ncclProfiler.name = ncclProfiler_v1->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v1)", ncclProfiler_v1->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); return NULL; } diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 11e521e90..d1c83cf1a 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v2_t* ncclProfiler_v2; @@ -20,6 +21,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { + eDescr_v2.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v2.coll.name = nullptr; // removed in v4 eDescr_v2.coll.commHash = 0; // removed in v4 eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; @@ -38,6 +40,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v2.p2p.name = nullptr; // removed in v4 eDescr_v2.p2p.commHash = 0; // removed in v4 + eDescr_v2.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v2.p2p.func = eDescr->p2p.func; eDescr_v2.p2p.buff = eDescr->p2p.buff; eDescr_v2.p2p.count = eDescr->p2p.count; @@ -83,7 +86,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v2->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t 
ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; @@ -97,9 +108,8 @@ ncclProfiler_t* getNcclProfiler_v2(void* lib) { if (ncclProfiler_v2) { ncclProfiler.name = ncclProfiler_v2->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v2)", ncclProfiler_v2->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2"); return NULL; } diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc index 3dba3231a..84ec1468e 100644 --- a/src/plugin/profiler/profiler_v3.cc +++ b/src/plugin/profiler/profiler_v3.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v3_t* ncclProfiler_v3; @@ -22,6 +23,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileColl: { eDescr_v3.coll.name = nullptr; // removed in v4 eDescr_v3.coll.commHash = 0; // removed in v4 + eDescr_v3.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v3.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v3.coll.func = eDescr->coll.func; eDescr_v3.coll.sendBuff = eDescr->coll.sendBuff; @@ -37,6 +39,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v3.p2p.name = nullptr; // removed in v4 eDescr_v3.p2p.commHash = 0; // removed in v4 + eDescr_v3.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v3.p2p.func = eDescr->p2p.func; eDescr_v3.p2p.buff = eDescr->p2p.buff; eDescr_v3.p2p.count = eDescr->p2p.count; @@ -89,7 +92,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v3->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent; @@ -103,9 +114,8 @@ ncclProfiler_t* getNcclProfiler_v3(void* lib) { if (ncclProfiler_v3) { ncclProfiler.name = ncclProfiler_v3->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v3)", ncclProfiler_v3->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); return NULL; } diff --git a/src/plugin/profiler/profiler_v4.cc 
b/src/plugin/profiler/profiler_v4.cc index 11bed891a..53b57ce80 100644 --- a/src/plugin/profiler/profiler_v4.cc +++ b/src/plugin/profiler/profiler_v4.cc @@ -7,15 +7,113 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_v4_t* ncclProfiler_v4; +static ncclProfiler_t ncclProfiler; + +static ncclResult_t ncclProfiler_startEvent(void* ctx, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + ncclProfilerEventDescr_v4_t eDescr_v4; + eDescr_v4.type = eDescr->type; + eDescr_v4.parentObj = eDescr->parentObj; + eDescr_v4.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v4.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v4.coll.func = eDescr->coll.func; + eDescr_v4.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v4.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v4.coll.count = eDescr->coll.count; + eDescr_v4.coll.root = eDescr->coll.root; + eDescr_v4.coll.datatype = eDescr->coll.datatype; + eDescr_v4.coll.nChannels = eDescr->coll.nChannels; + eDescr_v4.coll.nWarps = eDescr->coll.nWarps; + eDescr_v4.coll.algo = eDescr->coll.algo; + eDescr_v4.coll.proto = eDescr->coll.proto; + eDescr_v4.parentObj = eDescr->coll.parentGroup; + } break; + case ncclProfileP2p: { + eDescr_v4.p2p.func = eDescr->p2p.func; + eDescr_v4.p2p.buff = eDescr->p2p.buff; + eDescr_v4.p2p.count = eDescr->p2p.count; + eDescr_v4.p2p.datatype = eDescr->p2p.datatype; + eDescr_v4.p2p.peer = eDescr->p2p.peer; + eDescr_v4.parentObj = eDescr->p2p.parentGroup; + } break; + case ncclProfileProxyOp: { + eDescr_v4.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v4.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v4.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v4.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v4.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v4.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v4.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: { + eDescr_v4.kernelCh.channelId = eDescr->kernelCh.channelId; + eDescr_v4.kernelCh.pTimer = eDescr->kernelCh.pTimer; + } break; + case ncclProfileNetPlugin: { + eDescr_v4.netPlugin.id = eDescr->netPlugin.id; + eDescr_v4.netPlugin.data = eDescr->netPlugin.data; + } break; + default: return ncclSuccess; + } + return ncclProfiler_v4->startEvent(ctx, eHandle, &eDescr_v4); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + ncclProfilerEventStateArgs_v4_t eStateArgs_v4; + switch(eState) { + case ncclProfilerProxyOpInProgress_v4: + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendPeerWait_v4: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + eStateArgs_v4.proxyStep.transSize = eStateArgs->proxyStep.transSize; + break; + case ncclProfilerNetPluginUpdate: + eStateArgs_v4.netPlugin.data = eStateArgs->netPlugin.data; + break; + case ncclProfilerKernelChStop: + eStateArgs_v4.kernelCh.pTimer = eStateArgs->kernelCh.pTimer; + break; + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + eStateArgs_v4.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + 
break; + default: return ncclSuccess; + } + return ncclProfiler_v4->recordEventState(eHandle, (ncclProfilerEventState_v4_t)eState, &eStateArgs_v4); +} + +static ncclResult_t ncclProfiler_init(void** ctx, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nRanks, int rank, ncclDebugLogger_t logfn) { + NCCLCHECK(ncclProfiler_v4->init(ctx, eActivationMask, commName, commId, nNodes, nRanks, rank, logfn)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.stopEvent = ncclProfiler_v4->stopEvent; + ncclProfiler.finalize = ncclProfiler_v4->finalize; + return ncclSuccess; +} ncclProfiler_t* getNcclProfiler_v4(void* lib) { ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4"); if (ncclProfiler_v4) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name); - return ncclProfiler_v4; + ncclProfiler.name = ncclProfiler_v4->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v4)", ncclProfiler_v4->name); + return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4"); return NULL; } diff --git a/src/plugin/profiler/profiler_v5.cc b/src/plugin/profiler/profiler_v5.cc new file mode 100644 index 000000000..01d73db05 --- /dev/null +++ b/src/plugin/profiler/profiler_v5.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" +#include + +static ncclProfiler_v5_t* ncclProfiler_v5; + +ncclProfiler_t* getNcclProfiler_v5(void* lib) { + ncclProfiler_v5 = (ncclProfiler_v5_t*)dlsym(lib, "ncclProfiler_v5"); + if (ncclProfiler_v5) { + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v5)", ncclProfiler_v5->name); + return ncclProfiler_v5; + } + return NULL; +} diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc index 24a59de2e..dfa21ae7e 100644 --- a/src/plugin/tuner.cc +++ b/src/plugin/tuner.cc @@ -7,6 +7,7 @@ #include #include +#include #include "checks.h" #include "debug.h" @@ -16,8 +17,9 @@ extern ncclTuner_t* getNcclTuner_v2(void* lib); extern ncclTuner_t* getNcclTuner_v3(void* lib); extern ncclTuner_t* getNcclTuner_v4(void* lib); +extern ncclTuner_t* getNcclTuner_v5(void* lib); -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex tunerPluginMutex; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; static ncclTuner_t* tunerSymbol = nullptr; @@ -33,13 +35,14 @@ enum { static int status = tunerPluginLoadReady; ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { + const char* tunerName; // Initialize to nullptr by default if plugin tuner cannot be loaded. 
comm->tuner = nullptr; if (tunerPluginLoadFailed == status) { return ncclSuccess; } - pthread_mutex_lock(&tunerPluginLock); + std::lock_guard lock(tunerPluginMutex); if (tunerPluginLoadFailed == status) { goto exit; } @@ -50,15 +53,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { goto exit; } - tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); + if ((tunerName = ncclGetEnv("NCCL_TUNER_PLUGIN")) != nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "NCCL_TUNER_PLUGIN set by environment to %s", tunerName); + if (strcasecmp(tunerName, "none") == 0) + goto fail; + } + tunerPluginLib = ncclOpenTunerPluginLib(tunerName); if (nullptr == tunerPluginLib) { tunerPluginLib = ncclGetNetPluginLib(ncclPluginTypeTuner); if (nullptr == tunerPluginLib) { goto fail; } + tunerName = nullptr; + } else if (ncclPluginLibPaths[ncclPluginTypeTuner]) { + tunerName = ncclPluginLibPaths[ncclPluginTypeTuner]; } - tunerSymbol = getNcclTuner_v4(tunerPluginLib); + tunerSymbol = getNcclTuner_v5(tunerPluginLib); + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v4(tunerPluginLib); + } if (tunerSymbol == NULL) { tunerSymbol = getNcclTuner_v3(tunerPluginLib); } @@ -66,8 +80,10 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { tunerSymbol = getNcclTuner_v2(tunerPluginLib); } if (tunerSymbol == NULL) { + if (tunerName) INFO(NCCL_INIT|NCCL_TUNING, "External tuner plugin %s is unsupported", tunerName); goto fail; } + if (tunerName) INFO(NCCL_INIT|NCCL_TUNING, "Successfully loaded external tuner plugin %s", tunerName); comm->tuner = tunerSymbol; ++tunerPluginRefCount; @@ -75,7 +91,6 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { comm->tunerPluginLoaded = 1; exit: - pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; fail: if (tunerPluginLib) NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); @@ -85,9 +100,9 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { } ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); + std::lock_guard lock(tunerPluginMutex); if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; tunerSymbol = nullptr; @@ -95,6 +110,5 @@ ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { status = tunerPluginLoadReady; comm->tunerPluginLoaded = 0; } - pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; } diff --git a/src/plugin/tuner/CMakeLists.txt b/src/plugin/tuner/CMakeLists.txt new file mode 100644 index 000000000..71f4498ad --- /dev/null +++ b/src/plugin/tuner/CMakeLists.txt @@ -0,0 +1,10 @@ +# Tuner plugin sources +set(PLUGIN_TUNER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v2.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v3.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v4.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v5.cc +) + +# Add tuner plugin sources to parent scope +set(PLUGIN_TUNER_SOURCES ${PLUGIN_TUNER_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc index 005638f01..9deefc1fd 100644 --- a/src/plugin/tuner/tuner_v2.cc +++ b/src/plugin/tuner/tuner_v2.cc @@ -46,10 +46,15 @@ static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, si return ncclSuccess; } -static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t 
logfn, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context)); +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v2->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, ctx)); ncclTuner.getCollInfo = ncclTuner_getCollInfo; - ncclTuner.destroy = ncclTuner_v2->destroy; + ncclTuner.finalize = ncclTuner_finalize; return ncclSuccess; } @@ -58,9 +63,8 @@ ncclTuner_t* getNcclTuner_v2(void* lib) { if (ncclTuner_v2) { ncclTuner.name = ncclTuner_v2->name; ncclTuner.init = ncclTuner_init; - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v2)", ncclTuner_v2->name); return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); return NULL; } diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc index 3898243bc..3f896a644 100644 --- a/src/plugin/tuner/tuner_v3.cc +++ b/src/plugin/tuner/tuner_v3.cc @@ -18,10 +18,15 @@ static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, si return ncclSuccess; } -static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v3->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); ncclTuner.getCollInfo = ncclTuner_getCollInfo; - ncclTuner.destroy = ncclTuner_v3->destroy; + ncclTuner.finalize = ncclTuner_finalize; return ncclSuccess; } @@ -30,9 +35,8 @@ ncclTuner_t* getNcclTuner_v3(void* lib) { if (ncclTuner_v3) { ncclTuner.name = ncclTuner_v3->name; ncclTuner.init = ncclTuner_init; - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v3)", ncclTuner_v3->name); return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); return NULL; } diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc index 4bfd116bb..077ed0aea 100644 --- a/src/plugin/tuner/tuner_v4.cc +++ b/src/plugin/tuner/tuner_v4.cc @@ -7,16 +7,32 @@ #include #include "debug.h" +#include "checks.h" #include "nccl_tuner.h" static ncclTuner_v4_t* ncclTuner_v4; +static ncclTuner_t ncclTuner; + +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v4->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { + NCCLCHECK(ncclTuner_v4->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_v4->getCollInfo; + ncclTuner.finalize = ncclTuner_finalize; + return ncclSuccess; +} ncclTuner_t* getNcclTuner_v4(void* lib) { ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); if (ncclTuner_v4) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); - return ncclTuner_v4; + ncclTuner.name = ncclTuner_v4->name; 
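+    // Return the wrapper table instead of the raw v4 struct: init and finalize
+    // go through the shims above, while getCollInfo is forwarded unchanged when
+    // ncclTuner_init runs.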
+ ncclTuner.init = ncclTuner_init; + + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v4)", ncclTuner_v4->name); + return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); return NULL; } diff --git a/src/plugin/tuner/tuner_v5.cc b/src/plugin/tuner/tuner_v5.cc new file mode 100644 index 000000000..22c3d4b42 --- /dev/null +++ b/src/plugin/tuner/tuner_v5.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "nccl_tuner.h" + +static ncclTuner_v5_t* ncclTuner_v5; + +ncclTuner_t* getNcclTuner_v5(void* lib) { + ncclTuner_v5 = (ncclTuner_v5_t*)dlsym(lib, "ncclTunerPlugin_v5"); + if (ncclTuner_v5) { + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v5)", ncclTuner_v5->name); + return ncclTuner_v5; + } + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index 74ec70f0e..25a14cd64 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -13,12 +13,14 @@ #include "timer.h" #include "profiler.h" #include "transport.h" +#include "cpuset.h" #include #include #include #include #include +#include #define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1) @@ -385,6 +387,8 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->workCounter = op->workCounter; args->nsubs = subIndex+1; if (subIndex) { + args->nChannels = std::min(args->nChannels, op->nChannels); + args->nPeers = std::min(args->nPeers, op->nPeers); if ((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || @@ -398,7 +402,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr WARN("Proxy append on running operation"); return ncclInternalError; } - return ncclSuccess; + goto exit; } //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); args->done = 0; @@ -411,11 +415,15 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->pattern = op->pattern; args->protocol = op->protocol; args->coll = op->coll; + args->collAPI = op->collAPI; args->algorithm = op->algorithm; + args->nChannels = op->nChannels; + args->nPeers = op->nPeers; args->specifics = op->specifics; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; +exit: if (args->pattern != ncclPatternProfiler) ncclProfilerStartProxyOpEvent(subIndex, args); return ncclSuccess; } @@ -744,6 +752,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; + ncclResult_t status = ncclSuccess; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); @@ -751,6 +760,8 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone || ret != ncclSuccess) { + //track first error that 
occured + if (ret != ncclSuccess && status == ncclSuccess) status = ret; TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -759,7 +770,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr op = op->next; } } - return ncclSuccess; + return status; } NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16); @@ -899,16 +910,43 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) { NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); +static cpu_set_t proxyCpuset; +static pthread_once_t proxyCpusetOnce = PTHREAD_ONCE_INIT; +void proxyCpusetOnceFunc() { + const char* setEnv = ncclGetEnv("NCCL_PROXY_CPUSET"); + if (setEnv) { + ncclResult_t res = ncclStrListToCpuset(setEnv, &proxyCpuset); + if (res != ncclSuccess) { + INFO(NCCL_ENV, "failed to decode NCCL_PROXY_CPUSET=%s. Ignoring", setEnv); + goto fail; + } + // debug info + char msg[1024] = {0}; + cpu_set_t currSet; + sched_getaffinity(0, sizeof(cpu_set_t), &currSet); + (void)ncclCpusetToStrList(&currSet, msg, sizeof(msg)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " changed to "); + (void)ncclCpusetToStrList(&proxyCpuset, msg + strlen(msg), sizeof(msg) - strlen(msg)); + INFO(NCCL_ENV, "NCCL_PROXY_CPUSET = %s: %s", setEnv, msg); + return; + } + // if we arrive here we have either no env or we have failed to decode it +fail: + CPU_ZERO(&proxyCpuset); + return; +} + void* ncclProxyProgress(void *proxyState_) { struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; + + // This thread is created by proxyService, therefore setting the affinity is not needed. + INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - - INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; @@ -1567,15 +1605,17 @@ enum { void* ncclProxyService(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + + // set the thread affinity before setting the cuda context + pthread_once(&proxyCpusetOnce,proxyCpusetOnceFunc); + if (CPU_COUNT(&proxyCpuset)) sched_setaffinity(0, sizeof(cpu_set_t), &proxyCpuset); + INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev); } - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - - INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); // Prepare poll descriptor struct ncclProxyConnectionPool connectionPool; @@ -1760,14 +1800,17 @@ void* ncclProxyServiceUDS(void* _args) { struct ncclProxyState* proxyState = 
(struct ncclProxyState*) _args; struct pollfd pollfds[1]; + // set the thread affinity before setting the cuda context + pthread_once(&proxyCpusetOnce,proxyCpusetOnceFunc); + if (CPU_COUNT(&proxyCpuset)) sched_setaffinity(0, sizeof(cpu_set_t), &proxyCpuset); + INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev); } - INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); - if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) { WARN("[Proxy Service UDS] Get listenSock fd fails"); return NULL; @@ -1807,6 +1850,7 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; comm->proxyState->peerAddressesUDS = peerAddressesUDS; + comm->proxyState->netAttr = NCCL_NET_ATTR_INIT; // UDS support NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag)); @@ -1831,6 +1875,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { proxyState->dmaBufSupport = comm->dmaBufSupport; proxyState->ncclNet = comm->ncclNet; proxyState->ncclCollNet = comm->ncclCollNet; + proxyState->netContext = comm->netContext; + proxyState->collNetContext = comm->collNetContext; proxyState->profilerContext = comm->profilerContext; proxyState->directMode = comm->directMode; memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); diff --git a/src/ras/CMakeLists.txt b/src/ras/CMakeLists.txt new file mode 100644 index 000000000..2c08b8f99 --- /dev/null +++ b/src/ras/CMakeLists.txt @@ -0,0 +1,11 @@ +# RAS sources +set(RAS_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/collectives.cc + ${CMAKE_CURRENT_SOURCE_DIR}/rasnet.cc + ${CMAKE_CURRENT_SOURCE_DIR}/peers.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ras.cc + ${CMAKE_CURRENT_SOURCE_DIR}/client_support.cc +) + +# Add RAS sources to parent scope +set(RAS_SOURCES ${RAS_SOURCES} PARENT_SCOPE) diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 8ef551c64..948e26446 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,10 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ -// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. 
-#include -#undef _GLIBCXX_VISIBILITY -#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -76,7 +72,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); -static void rasTerminate() __attribute__((destructor)); +static void rasTerminate(); NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); @@ -111,6 +107,8 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) ncclSetThreadName(rasThread, "NCCL RAS"); rasInitialized = true; + + atexit(rasTerminate); } } ncclAtomicRefCountIncrement(&rasInitRefCount); diff --git a/src/register/CMakeLists.txt b/src/register/CMakeLists.txt new file mode 100644 index 000000000..b3b35bfe6 --- /dev/null +++ b/src/register/CMakeLists.txt @@ -0,0 +1,9 @@ +# Register sources +set(REGISTER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/register.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_reg.cc + ${CMAKE_CURRENT_SOURCE_DIR}/sendrecv_reg.cc +) + +# Add register sources to parent scope +set(REGISTER_SOURCES ${REGISTER_SOURCES} PARENT_SCOPE) diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index d9d9fb436..6cbb0c75f 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "register.h" #include "transport.h" #include "enqueue.h" @@ -176,7 +182,8 @@ ncclResult_t ncclRegisterCollBuffers( // IPC buffer registration if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit; if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; - if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; + if (info->algorithm == NCCL_ALGO_TREE && info->sendbuff == info->recvbuff) goto exit; + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->sendbuff == info->recvbuff && comm->maxLocalRanks > 1) goto exit; if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; int peerRanks[NCCL_MAX_LOCAL_RANKS]; diff --git a/src/register/register.cc b/src/register/register.cc index 59928f57e..b118a4cc4 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -14,18 +14,6 @@ NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); -static ncclResult_t regFindHandleFromSymAddr(struct ncclComm* comm, void* baseSymPtr, struct ncclReg** handle) { - struct ncclRegCache* cache = &comm->regCache; - *handle = NULL; - for (int slot = 0; slot < cache->population; slot++) { - if (baseSymPtr == cache->slots[slot]->baseSymPtr) { - *handle = cache->slots[slot]; - break; - } - } - return ncclSuccess; -} - ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { if (reg && isValid) { if (reg->localRefs) @@ -174,104 +162,3 @@ ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *hand NCCLCHECK(commDeregister(comm, true, handle)); return ncclSuccess; } - -ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle) { - ncclResult_t ret = ncclSuccess; - void* regSymAddr = NULL; - ALIGN_SIZE(comm->symAllocHead, alignment); - 
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, baseSize, memHandle, ®SymAddr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, baseSize, regSymAddr), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); - comm->symAllocHead += baseSize; - regHandle->baseSymPtr = regSymAddr; - regHandle->symSize = baseSize; -exit: - return ret; -fail: - regHandle->baseSymPtr = NULL; - regHandle->symSize = 0; - goto exit; -} - -NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); -ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags) { - ncclResult_t ret = ncclSuccess; - CUmemGenericAllocationHandle memHandle; - size_t baseSize; - void* baseAddr = NULL; - struct ncclReg* regHandle = NULL; - int saveDev; - - *win = NULL; - - CUDACHECK(cudaGetDevice(&saveDev)); - NCCLCHECK(ncclGroupStartInternal()); - if (!ncclParamLocalRegister() || !ncclCuMemEnable()) { - goto exit; - } - - NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - if (comm && buff && size && win) { - size_t alignment = 0; - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)buff), ret, fail); - // size and alignment check - if (!((uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0 && baseSize % NCCL_REC_PAGE_SIZE == 0 && (uintptr_t)buff + size <= (uintptr_t)baseAddr + baseSize)) { - WARN("buffer %p (baseAddr %p align %d) size %zu (baseSize %ld align %d) does not satisfy symmetric registration requirements", buff, baseAddr, (uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0, size, baseSize, baseSize % NCCL_REC_PAGE_SIZE == 0); - goto fail; - } - NCCLCHECKGOTO(ncclRegister(comm, baseAddr, baseSize, false, (void**)®Handle), ret, fail); - NCCLCHECKGOTO(ncclCalloc(win, 1), ret, fail); - (*win)->handle = regHandle; - regHandle->winFlags = winFlags; - if (regHandle->baseSymPtr == NULL && comm->symmetricSupport) { - struct ncclSymRegTask* task; - CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, baseAddr), ret, fail); - CUCHECKGOTO(cuMemRelease(memHandle), ret, fail); - alignment = baseSize >= NCCL_REC_PAGE_SIZE * 72L ? 
NCCL_MAX_PAGE_SIZE : NCCL_REC_PAGE_SIZE; - NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); - task->buff = buff; - task->baseSize = baseSize; - task->memHandle = memHandle; - task->regHandle = regHandle; - task->alignment = alignment; - ncclIntruQueueEnqueue(&comm->symRegTaskQueue, task); - ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); - } - } - -exit: - ncclGroupErrCheck(ret); - NCCLCHECK(ret = ncclGroupEndInternal()); - cudaSetDevice(saveDev); - return ret; -fail: - free(*win); - *win = NULL; - goto exit; -} - -NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); -ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win) { - ncclResult_t ret = ncclSuccess; - int saveDev; - struct ncclReg* regHandle; - CUDACHECK(cudaGetDevice(&saveDev)); - if (win == NULL) goto exit; - regHandle = win->handle; - if (regHandle && ncclParamLocalRegister() && ncclCuMemEnable()) { - if (regHandle->baseSymPtr) { - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); - NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); - } - NCCLCHECKGOTO(commDeregister(comm, false, regHandle), ret, fail); - } - free(win); -exit: - CUDACHECK(cudaSetDevice(saveDev)); - return ret; -fail: - goto exit; -} diff --git a/src/register/sendrecv_reg.cc b/src/register/sendrecv_reg.cc index f82fbd714..9114fab01 100644 --- a/src/register/sendrecv_reg.cc +++ b/src/register/sendrecv_reg.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "register.h" #include "transport.h" diff --git a/src/scheduler/CMakeLists.txt b/src/scheduler/CMakeLists.txt new file mode 100644 index 000000000..f6583bd4d --- /dev/null +++ b/src/scheduler/CMakeLists.txt @@ -0,0 +1,7 @@ +# Scheduler sources +set(SCHEDULER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/symmetric_sched.cc +) + +# Add scheduler sources to parent scope +set(SCHEDULER_SOURCES ${SCHEDULER_SOURCES} PARENT_SCOPE) diff --git a/src/scheduler/symmetric_sched.cc b/src/scheduler/symmetric_sched.cc new file mode 100644 index 000000000..440b6061b --- /dev/null +++ b/src/scheduler/symmetric_sched.cc @@ -0,0 +1,235 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SYMMETRIC_SCHED_H_ +#define NCCL_SYMMETRIC_SCHED_H_ + +#include "scheduler.h" + +ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclIntruQueue* symTaskQueue, struct ncclTaskColl** remainTasksHead) { + ncclResult_t ret = ncclSuccess; + int fnOpTySymCount = 0; + struct ncclTaskColl* tasksSymByFnOpTy[ncclNumFuncs * ncclNumDevRedOps * ncclNumTypes]; + int fnOpTySymIndices[ncclNumFuncs * ncclNumDevRedOps * ncclNumTypes]; + struct ncclKernelPlanner* planner = &comm->planner; + struct ncclTaskColl* remainTasksTail = nullptr; + + memset(tasksSymByFnOpTy, 0, sizeof(tasksSymByFnOpTy)); + *remainTasksHead = nullptr; + while (task != nullptr) { + int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype; + struct ncclTaskColl* next = task->next; + NCCLCHECK(ncclDevrFindWindow(comm, task->sendbuff, &task->sendWin)); + NCCLCHECK(ncclDevrFindWindow(comm, task->recvbuff, &task->recvWin)); + bool symAvailable = ncclSymkAvailable(comm, task->func, task->opDev.op, task->datatype, task->count); + + if (task->sendWin && task->recvWin && (task->sendWin->winFlags & task->recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && symAvailable) { + if (tasksSymByFnOpTy[index] == nullptr) fnOpTySymIndices[fnOpTySymCount++] = index; + task->next = tasksSymByFnOpTy[index]; + tasksSymByFnOpTy[index] = task; + planner->nTasksColl--; + } else { + if (*remainTasksHead) { + remainTasksTail->next = task; + remainTasksTail = task; + } else { + *remainTasksHead = remainTasksTail = task; + } + } + task = next; + } + if (remainTasksTail) remainTasksTail->next = nullptr; + + // make sure kernel args space can hold at least a single work + assert(comm->workArgsBytes >= ncclSymkDevWorkArgs::calcArgsSize(MAXCHANNELS, 1)); + + // Determine symmetric tasks kernels + for (int cursor = 0; cursor < fnOpTySymCount; cursor++) { + struct ncclTaskColl* task = tasksSymByFnOpTy[fnOpTySymIndices[cursor]]; + while (task != NULL) { + ncclSymkKernelId kernelId = ncclSymkKernelId_Count; + int nChannels = MAXCHANNELS; + int nWarps = 0; + int nWorks = 0; + float estTimeUs = 1.e18; + size_t countTotal = 0, countMax = 0; + struct ncclTaskColl* headTask = task; + size_t cellCount = NCCL_SYM_KERNEL_CELL_SIZE / ncclTypeSize(headTask->datatype); + // For now we assume higher kernel id means a kernel for larger data size + while (task != nullptr) { + size_t count; + nWorks++; + count = alignUp(task->count, cellCount); + countTotal += count; + if (count > countMax) countMax = count; + if (ncclSymkDevWorkArgs::calcArgsSize(MAXCHANNELS, nWorks + 1) > comm->workArgsBytes || task->next == nullptr) { + task->isSymLast = 1; + break; + } + task = task->next; + } + NCCLCHECK(ncclSymkPickKernel(comm, headTask->func, headTask->opDev.op, headTask->datatype, + countTotal, countMax, nWorks, + &estTimeUs, &kernelId, &nChannels, &nWarps)); + if (kernelId == ncclSymkKernelId_Count) { + char const* name = ncclGetEnv("NCCL_SYM_KERNEL"); + WARN("Error: no symmetric kernel available for function %s.%s%s", + ncclFuncToString(headTask->func), (name ? " NCCL_SYM_KERNEL was set to " : ""), (name ? name: "")); + ret = (name ? 
ncclInvalidUsage : ncclInternalError); + goto fail; + } + // set all symmetric tasks to the same kernel + task = headTask; + while (task != nullptr) { + struct ncclTaskColl* next = task->next; + int isSymLast = task->isSymLast; + task->devFuncId = (uint32_t)kernelId; + task->nMaxChannels = nChannels; + task->nWarps = nWarps; + ncclIntruQueueEnqueue(&planner->collSymTaskQueue, task); + task = next; + if (isSymLast) break; + } + } + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclSymmetricTaskScheduler(struct ncclComm* comm, struct ncclIntruQueue* symTaskQueue, struct ncclKernelPlan* plan) { + struct ncclTaskColl* headTask = ncclIntruQueueHead(symTaskQueue); + int devFuncId = headTask->devFuncId; + struct ncclTaskColl* task = NULL; + ssize_t totalCount = 0; // aligned bytes + ssize_t logCount = 0; + ssize_t remainCell = 0; + ssize_t cellPerChannel = 0; + int workCount = 0, workIndex = 0; + size_t cellCount = NCCL_SYM_KERNEL_CELL_SIZE / ncclTypeSize(headTask->datatype); // minimal cell size + ncclResult_t ret = ncclSuccess; + int curChannel = 0; + int curChannelWork = 0; + int nMaxChannels = headTask->nMaxChannels; + struct ncclSymkDevWork* workBufPtr = NULL; + struct ncclSymkChannelWorkRange* workRangePtr = NULL; + const char* funcName = ncclFuncToString(headTask->func); + const char* kernelName = ncclSymkKernelIdToString(headTask->devFuncId); + struct ncclSymkDevWorkArgs* argsBuf = NULL; + + plan->isSymColl = true; + plan->threadPerBlock = headTask->nWarps * WARP_SIZE; + plan->hasProxyOps = false; + plan->kernelFn = ncclSymkGetKernelPtr((ncclSymkKernelId)headTask->devFuncId, headTask->opDev.op, headTask->datatype); + task = headTask; + while (task != nullptr && task->devFuncId == devFuncId) { + workCount++; + totalCount += alignUp(task->count, cellCount); + logCount += task->count; + if (task->isSymLast == 1) break; + task = task->next; + } + + plan->kernelArgsSize = ncclSymkDevWorkArgs::calcArgsSize(nMaxChannels, workCount); + argsBuf = (struct ncclSymkDevWorkArgs*)calloc(1, plan->kernelArgsSize); + + remainCell = cellPerChannel = DIVUP(DIVUP(totalCount, nMaxChannels), cellCount); + workRangePtr = argsBuf->getWorkRange(); + workBufPtr = argsBuf->getWorks(nMaxChannels); + argsBuf->nMaxChannels = nMaxChannels; + + while (!ncclIntruQueueEmpty(symTaskQueue)) { + struct ncclSymkDevWork devWork = {}; + size_t cellLeft = 0, taskCell = 0; + uint8_t isSymLast = 0; + + if (ncclIntruQueueHead(symTaskQueue)->devFuncId != devFuncId) break; // scheduling is done + + task = ncclIntruQueueDequeue(symTaskQueue); + isSymLast = task->isSymLast; + + NCCLCHECKGOTO(ncclSymkMakeDevWork(comm, task, &devWork), ret, fail); + + cellLeft = taskCell = DIVUP(task->count, cellCount); + for (;curChannel < nMaxChannels;) { + workRangePtr[curChannel].workHi = workIndex; + if (curChannelWork == 0) { + if (devWork.nChannels == 0) { + devWork.sChannelId = curChannel; + devWork.nChannels = 1; + } else if (cellLeft <= remainCell) { + // the last segment of the task + assert(devWork.nChannels > 0); + // if the remaining cell is less than 1024 bytes, we can fuse the last channel + if ((remainCell - cellLeft) * NCCL_SYM_KERNEL_CELL_SIZE <= (1 << 10) || ncclIntruQueueEmpty(symTaskQueue)) devWork.nChannels++; + } else { + // middle segment of the task + devWork.nChannels++; + } + } else { + assert(cellLeft == taskCell); + if (taskCell <= remainCell) { + // the first segment of the task is fully scheduled onto the channel + devWork.sChannelId = curChannel; + devWork.nChannels = 1; + } + } + if (cellLeft < 
remainCell) { + workRangePtr[curChannel].fracHi = uint16_t(0x10000UL - 1); + remainCell -= cellLeft; + curChannelWork++; + break; + } else if (cellLeft == remainCell) { + workRangePtr[curChannel].fracHi = uint16_t(0x10000UL - 1); + remainCell = cellPerChannel; + curChannel++; + curChannelWork = 0; + break; + } else { + // cellLeft > remainCell; the task is partially scheduled onto the channel + cellLeft -= remainCell; + workRangePtr[curChannel].fracHi = uint16_t(DIVUP(0x10000L * (taskCell - cellLeft), taskCell) - 1); + remainCell = cellPerChannel; + curChannel++; + curChannelWork = 0; + } + } + memcpy(workBufPtr + workIndex, &devWork, sizeof(struct ncclSymkDevWork)); + workIndex++; + + // Profiler + plan->groupApiEventHandle = task->groupApiEventHandle; + + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task); + if (isSymLast == 1) break; + if (curChannel == nMaxChannels) { + WARN("ncclSymmetricTaskScheduler ran out of channel space (nMaxChannels=%d, workCount=%d, workIndex=%d)", + nMaxChannels, workCount, workIndex); + goto fail; + } + } + if (remainCell < cellPerChannel) curChannel++; + + memcpy(&argsBuf->kcomm, &comm->symkState.kcomm, sizeof(comm->symkState.kcomm)); + plan->workBytes = totalCount * ncclTypeSize(headTask->datatype); + plan->channelMask = uint64_t(-1) >> (64 - curChannel); + plan->kernelSymArgs = (void*)argsBuf; + plan->workStorageType = ncclDevWorkStorageTypeArgs; + + if (comm->rank == 0) { + INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d nWorks %d", funcName, + logCount * ncclTypeSize(headTask->datatype), kernelName, curChannel, plan->threadPerBlock, workCount); + } + +exit: + return ret; +fail: + goto exit; +} + +#endif // NCCL_SYMMETRIC_SCHED_H_ diff --git a/src/symmetric.cc b/src/sym_kernels.cc similarity index 52% rename from src/symmetric.cc rename to src/sym_kernels.cc index f5b1e6c22..df4965d56 100644 --- a/src/symmetric.cc +++ b/src/sym_kernels.cc @@ -1,14 +1,22 @@ -#include "symmetric.h" +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "sym_kernels.h" #include "comm.h" #include "device.h" +#include "transport.h" #include constexpr char const* kernelName[] = { - // Must align with enum ncclSymKernelId definition in src/include/symmetric.h + // Must align with enum ncclSymkKernelId definition in src/include/sym_kernels.h "AllReduce_AGxLL_R", "AllReduce_AGxLLMC_R", "AllReduce_RSxLD_AGxST", "AllReduce_RSxLDMC_AGxSTMC", + "AllReduce_RSxNet_ARxMC_AGxNet", "AllGather_LL", "AllGather_LLMC", "AllGather_ST", @@ -18,34 +26,34 @@ constexpr char const* kernelName[] = { "ReduceScatter_LDMC" }; -constexpr uint32_t kernelMask_STMC = 1<nRanks; - int nMaxBlocks = ncclSymMaxBlocks; + int nMaxBlocks = ncclSymkMaxBlocks; int nMaxBlocksNvls = divUp((comm->cudaArch < 1000 ? 
16 : 32), nRanks); size_t busBytes; // max(bytes sent, bytes received) double busMultiplier = 1; @@ -116,45 +124,45 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, busBytes = size_t(1)<<50; break; - case ncclSymKernelId_AllReduce_AGxLL_R: + case ncclSymkKernelId_AllReduce_AGxLL_R: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_AllReduce_AGxLLMC_R: + case ncclSymkKernelId_AllReduce_AGxLLMC_R: busBytes = nRanks*nBytes*LL_BusFactor; busMultiplier = 1.1; // To beat non-MC LL break; - case ncclSymKernelId_AllReduce_RSxLD_AGxST: + case ncclSymkKernelId_AllReduce_RSxLD_AGxST: busBytes = 2*nBytes*(nRanks-1)/nRanks; break; - case ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC: + case ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC: busBytes = nBytes/nRanks + nBytes; busMultiplier = nRanks; nMaxBlocks = nMaxBlocksNvls; break; - case ncclSymKernelId_AllGather_LL: + case ncclSymkKernelId_AllGather_LL: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_AllGather_LLMC: + case ncclSymkKernelId_AllGather_LLMC: busBytes = nRanks*nBytes*LL_BusFactor; busMultiplier = 1.1; // To beat non-MC LL break; - case ncclSymKernelId_AllGather_ST: + case ncclSymkKernelId_AllGather_ST: busBytes = (nRanks-1)*nBytes; break; - case ncclSymKernelId_AllGather_STMC: + case ncclSymkKernelId_AllGather_STMC: busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. busMultiplier = 0.55*nRanks; nMaxBlocks = nMaxBlocksNvls; break; - case ncclSymKernelId_ReduceScatter_LL: + case ncclSymkKernelId_ReduceScatter_LL: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_ReduceScatter_LD: + case ncclSymkKernelId_ReduceScatter_LD: busBytes = (nRanks-1)*nBytes; break; - case ncclSymKernelId_ReduceScatter_LDMC: + case ncclSymkKernelId_ReduceScatter_LDMC: busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. busMultiplier = 0.55*nRanks; nMaxBlocks = nMaxBlocksNvls; @@ -164,7 +172,7 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, nMaxBlocks = std::min(nMaxBlocks, comm->config.maxCTAs); int nMinBlocks = comm->config.minCTAs; - int nUserCTAs = std::min(ncclSymMaxBlocks, ncclParamSymCTAs()); + int nUserCTAs = std::min(ncclSymkMaxBlocks, ncclParamSymCTAs()); if (nUserCTAs > 0) nMinBlocks = nMaxBlocks = nUserCTAs; bool isLL = kernelMask_LL>>k & 1; @@ -175,11 +183,11 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, if (comm->cudaArch < 1000) { baseLat = isLL ? 4.5 : 7.8; smBw = isAR ? 65*GBps : 44*GBps; - peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; + peakBw = k == ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; } else { baseLat = isLL ? (isAG ? 8.5 : 11) : (isAR ? 19.5 : 13.0); smBw = 55*GBps; - peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 1000*GBps : 600*GBps; + peakBw = k == ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC ? 
1000*GBps : 600*GBps; } *nBlocks = nMaxBlocks; *timeUs = model(busBytes, baseLat, nMaxBlocks, smBw, busMultiplier, peakBw); @@ -194,7 +202,36 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, } } -bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { +ncclResult_t ncclSymkInitOnce(struct ncclComm* comm) { + struct ncclSymkState* symk = &comm->symkState; + if (!symk->initialized) { + symk->initialized = true; + struct ncclDevCommRequirements reqs = {}; + reqs.lsaMultimem = comm->nvlsSupport; + reqs.lsaBarrierCount = ncclSymkMaxBlocks; + + struct ncclDevResourceRequirements lla2aReq; + ncclLLA2ACreateRequirement( + ncclSymkMaxBlocks, ncclLLA2ACalcSlots(ncclTeamLsa(comm).nRanks*ncclSymkMaxThreads, ncclSymkLLMaxEltSize), + &symk->kcomm.lsaLLA2A, &lla2aReq + ); + lla2aReq.next = reqs.resourceRequirementsList; + reqs.resourceRequirementsList = &lla2aReq; + + NCCLCHECK(ncclDevrCommCreateInternal(comm, &reqs, &symk->kcomm.devComm)); + } + return ncclSuccess; +} + +ncclResult_t ncclSymkFinalize(struct ncclComm* comm) { + struct ncclSymkState* symk = &comm->symkState; + if (symk->initialized) { + NCCLCHECK(ncclDevCommDestroy(comm, &symk->kcomm.devComm)); + } + return ncclSuccess; +} + +static bool ncclSymkImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { bool isFloat; switch (ty) { case ncclFloat64: @@ -221,10 +258,7 @@ bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType } } -ncclResult_t ncclSymPickKernel( - struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, - float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps - ) { +static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts) { uint32_t kmask = kernelMask_coll(coll); kmask &= kernelMask_user(); @@ -263,14 +297,37 @@ ncclResult_t ncclSymPickKernel( // to be at least 32 bytes per chunk) if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0; - ncclSymKernelId bestKernel = ncclSymKernelId_Count; + return kmask; +} + +bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, + ncclDataType_t ty, size_t nElts) { + if (!ncclSymkImplemented(coll, red, ty)) + return false; + + return (ncclSymkMask(comm, coll, red, ty, nElts) != 0); +} + +ncclResult_t ncclSymkPickKernel( + struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, + size_t nEltsTotal, size_t nEltsMax, int nWorks, + float* estTimeUs, ncclSymkKernelId* kernelId, int* nBlocks, int* nWarps + ) { + uint32_t kmask = ncclSymkMask(comm, coll, red, ty, nEltsMax); + + // We currently don't support grouping for LL kernels. 
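+  // Dropping kernelMask_LL from the candidate mask below means fused
+  // (nWorks > 1) launches can only select non-LL symmetric kernels.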
+ if (nWorks > 1) + kmask &= ~kernelMask_LL; + + ncclSymkKernelId bestKernel = ncclSymkKernelId_Count; float bestTime = 1.e30f; int bestBlocks = 999; + size_t nBytes = nEltsTotal*ncclTypeSize(ty); constexpr float smPenalty = .025f; // 2.5% percent increase in time per SM uint32_t kmaskRemain = kmask; while (kmaskRemain != 0) { - ncclSymKernelId k = (ncclSymKernelId)popFirstOneBit(&kmaskRemain); + ncclSymkKernelId k = (ncclSymkKernelId)popFirstOneBit(&kmaskRemain); float kTime; int kBlocks; queryModel(comm, k, nBytes, &kTime, &kBlocks); @@ -282,15 +339,29 @@ ncclResult_t ncclSymPickKernel( } *kernelId = bestKernel; - *estTimeUs = kmask==0 || kernelMask_user() == (1<= ncclSymKernelId_Count) { +const char* ncclSymkKernelIdToString(int kernelId) { + if (kernelId < 0 || kernelId >= ncclSymkKernelId_Count) { return "Unknown"; } return kernelName[kernelId]; } + +/* this function fills in the devWork except nextWorkOffset */ +ncclResult_t ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclSymkDevWork* outDevWork) { + outDevWork->rootRank = task->root; + outDevWork->redOpArg = task->opDev.scalarArg; + outDevWork->nElts = task->count; + outDevWork->inputWin = task->sendWin->vidmem; + outDevWork->inputOff = (uint8_t*)task->sendbuff - (uint8_t*)task->sendWin->userPtr; + outDevWork->outputWin = task->recvWin->vidmem; + outDevWork->outputOff = (uint8_t*)task->recvbuff - (uint8_t*)task->recvWin->userPtr; + outDevWork->sChannelId = 0xffff; + outDevWork->nChannels = 0; + return ncclSuccess; +} diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt new file mode 100644 index 000000000..0485008c0 --- /dev/null +++ b/src/transport/CMakeLists.txt @@ -0,0 +1,15 @@ +# Transport sources +set(TRANSPORT_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/nvls.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_socket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/p2p.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_ib.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/shm.cc + ${CMAKE_CURRENT_SOURCE_DIR}/generic.cc +) + +# Add transport sources to parent scope +set(TRANSPORT_SOURCES ${TRANSPORT_SOURCES} PARENT_SCOPE) diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 386865e21..6cf6b18c7 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -355,7 +355,7 @@ static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, collNet->resources = resources; } if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); + NCCLCHECK(proxyState->ncclCollNet->listen(proxyState->collNetContext, netDev, collNetHandle, resources->collNetListenComms + netDev)); return ncclSuccess; } @@ -1223,14 +1223,19 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; *outHandle = NULL; if (comm && userbuff && buffSize > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) - NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + 
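+      // If the requested range spills past the allocation reported by
+      // cuMemGetAddressRange, skip registration (goto exit leaves
+      // *outRegBufFlag == 0) rather than registering a partial region.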
if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + } + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); } exit: return ret; @@ -1256,13 +1261,14 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ret = ncclSuccess; struct ncclCollnetCleanupCallback* record = NULL; struct ncclReg *regRecord = NULL; - void *baseSend = NULL; - size_t baseSendSize = 0; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); - NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + NCCLCHECKGOTO(ncclCommGraphRegister(comm, base, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); if (*outRegBufFlag) { @@ -1473,11 +1479,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop ncclResult_t ret = ncclSuccess; int rank = comm->rank; int collNetSetupFail = 0; - // Find all head ranks - int nHeadsUnique = 0; - int* headsUnique = NULL; bool share; - struct ncclTopoGraph* directGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; struct collnetShareInfo { int headPosition; @@ -1485,20 +1487,30 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop }; struct collnetShareInfo* infos = NULL; - NCCLCHECKGOTO(ncclCalloc(&headsUnique, directGraph->nChannels), ret, fail); - { uint64_t mask = 0; + struct ncclTopoGraph* collNetGraph; + + if (!comm->nvlsSupport) { + collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + NCCLCHECKGOTO(ncclCalloc(&comm->collNetHeads, collNetGraph->nChannels), ret, fail); + uint64_t mask = 0; // Head GPU index is always 0 - for (int c = 0; c < directGraph->nChannels; c++) { - int head = directGraph->intra[c * comm->localRanks + 0]; + for (int c = 0; c < collNetGraph->nChannels; c++) { + int head = collNetGraph->intra[c * comm->localRanks + 0]; assert(comm->rankToNode[head] == comm->node); uint64_t mask0 = mask; mask |= 1ull<rankToLocalRank[head]; - if (mask != mask0) headsUnique[nHeadsUnique++] = head; + if (mask != mask0) comm->collNetHeads[comm->collNetHeadsNum++] = head; } + } else { + // Use the NVLS graph to get the head ranks for collnet setup. comm->nvlsHeads already has unique heads. + // nHeads is the same on all the channels, see connectNvls function + collNetGraph = graphs[NCCL_ALGO_NVLS]; + NCCLCHECKGOTO(ncclCalloc(&comm->collNetHeads, collNetGraph->nChannels), ret, fail); + comm->collNetHeadsNum = comm->channels[0].nvls.nHeads; + // Copy over comm->collNetHeads from comm->nvlsHeads since they are freed in different places. + memcpy(comm->collNetHeads, comm->nvlsHeads, comm->collNetHeadsNum * sizeof(int)); } - comm->collNetHeads = headsUnique; - comm->collNetHeadsNum = nHeadsUnique; if (parent && parent->config.collnetEnable && parent->nNodes == comm->nNodes) { if (!parent->shareResources) { collNetSetupFail = 1; @@ -1508,7 +1520,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop /* check whether child can share collnet resources of parent. 
Since parent builds each collnet communicator * based on heads with the same head position in each node, as long as the collnet heads of child comm * can match parent's heads, we can let child communicator share parent's collnet resources. */ - for (int h = 0; h < nHeadsUnique; ++h) { + for (int h = 0; h < comm->collNetHeadsNum; ++h) { int prev = INT_MIN; struct collnetShareInfo* myinfo; @@ -1516,7 +1528,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop myinfo = infos + comm->rank; memset(myinfo, 0, sizeof(struct collnetShareInfo)); /* find the child head position in parent collnet heads. */ - if (headsUnique[h] == comm->rank) { + if (comm->collNetHeads[h] == comm->rank) { myinfo->headPosition = -1; myinfo->isMaster = 1; for (int th = 0; th < parent->collNetHeadsNum; ++th) @@ -1567,11 +1579,11 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); - for (int h = 0; h < nHeadsUnique; h++) { - const int head = headsUnique[h]; + for (int h = 0; h < comm->collNetHeadsNum; h++) { + const int head = comm->collNetHeads[h]; ncclConnect connect; - collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetRecv, &connect); - if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetSend, &connect); + collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect); + if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect); } // Verify CollNet setup across ranks after trying the first channel if (c == 0) { @@ -1592,7 +1604,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop bool isHead = false; matrix = nullptr; NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); - for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); + for (int h = 0; h < comm->collNetHeadsNum; h++) isHead |= (comm->collNetHeads[h] == comm->rank); if (isHead) { for (int ty=0; ty < ncclNumTypes; ty++) { for (int op=0; op < 4; op++) { diff --git a/src/transport/generic.cc b/src/transport/generic.cc index 47b023667..a42418773 100644 --- a/src/transport/generic.cc +++ b/src/transport/generic.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + #include "comm.h" #include "transport.h" #include "bootstrap.h" diff --git a/src/transport/net.cc b/src/transport/net.cc index c0cd20d6e..5d1a5601c 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -178,6 +178,101 @@ struct setupReq { NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1); static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag"); + +// Common function to initialize network attributes from a ncclComm +static void populateCommNetAttrs(struct ncclComm* comm, struct ncclConnector* conn, ncclNetAttr_t* netAttr) { + *netAttr = NCCL_NET_ATTR_INIT; + netAttr->sendCommAttr.minConcurrentPeers = 1; + netAttr->sendCommAttr.minFlowsPerPeer = 1; + + netAttr->recvCommAttr.minConcurrentPeers = 1; + netAttr->recvCommAttr.minFlowsPerPeer = 1; + + if (conn->p2pOnly) { + size_t maxConcPeers = comm->p2pnChannels * NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + if (comm->nRanks < maxConcPeers) maxConcPeers = comm->nRanks; + + netAttr->sendCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->sendCommAttr.maxFlowsPerPeer = comm->p2pnChannelsPerPeer; + netAttr->recvCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->recvCommAttr.maxFlowsPerPeer = comm->p2pnChannelsPerPeer; + netAttr->op = BIT(ncclFuncSend) | BIT(ncclFuncRecv) | + BIT(ncclFuncAlltoAll) | BIT(ncclFuncScatter) | BIT(ncclFuncGather); + } else { + size_t maxConcPeers = (NCCL_MAX_TREE_ARITY - 1) * 2; + if (comm->nRanks < maxConcPeers) maxConcPeers = comm->nRanks; + netAttr->sendCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->sendCommAttr.maxFlowsPerPeer = comm->nChannels; + netAttr->recvCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->recvCommAttr.maxFlowsPerPeer = comm->nChannels; + } +} + +// Apply the netAttr to the netComm +void setNetAttrs(struct ncclProxyState* proxyState, ncclNetAttr_t* netAttr) +{ + if (proxyState->ncclNet->setNetAttr) { + proxyState->ncclNet->setNetAttr(proxyState->netContext, netAttr); + proxyState->netAttr = *netAttr; + } +} + +void printNetAttrs(ncclNetAttr_t* netAttr, const char *task) +{ + const int opBufLen = ncclNumFuncs*32; + char opBuf[opBufLen] = ""; + const int algoBufLen = NCCL_NUM_ALGORITHMS*32; + char algoBuf[algoBufLen] = ""; + const int protoBufLen = NCCL_NUM_PROTOCOLS*32; + char protoBuf[protoBufLen] = ""; + + ncclBitsToString(netAttr->op, MASK(ncclNumFuncs), (const char* (*)(int))ncclFuncToString, opBuf, opBufLen, "*"); + ncclBitsToString(netAttr->algo, MASK(NCCL_NUM_ALGORITHMS), ncclAlgoToString, algoBuf, algoBufLen, "*"); + ncclBitsToString(netAttr->proto, MASK(NCCL_NUM_PROTOCOLS), ncclProtoToString, protoBuf, protoBufLen, "*"); + + TRACE(NCCL_NET, "%s hints, send peers/flows: [%d-%d][%d-%d] recv peers/flows: [%d-%d][%d-%d] op: %s algo: %s proto: %s", + task, netAttr->sendCommAttr.minConcurrentPeers, netAttr->sendCommAttr.maxConcurrentPeers, + netAttr->sendCommAttr.minFlowsPerPeer, netAttr->sendCommAttr.maxFlowsPerPeer, + netAttr->recvCommAttr.minConcurrentPeers, netAttr->recvCommAttr.maxConcurrentPeers, + netAttr->recvCommAttr.minFlowsPerPeer, netAttr->recvCommAttr.maxFlowsPerPeer, + opBuf, algoBuf, protoBuf); +} + +// Set the netAttr for a transfer operation +void setXferNetAttrs(struct ncclProxyState* proxyState, struct ncclProxyArgs* args, int send) +{ + ncclNetAttr_t netAttr; + + if (!proxyState->ncclNet->setNetAttr) + return; + + netAttr = 
proxyState->netAttr; + + if (send) { + netAttr.sendCommAttr.maxConcurrentPeers = args->nPeers; + netAttr.sendCommAttr.minConcurrentPeers = args->nPeers; + netAttr.sendCommAttr.maxFlowsPerPeer = args->nChannels; + netAttr.sendCommAttr.minFlowsPerPeer = args->nChannels; + } else { + netAttr.recvCommAttr.maxConcurrentPeers = args->nPeers; + netAttr.recvCommAttr.minConcurrentPeers = args->nPeers; + netAttr.recvCommAttr.maxFlowsPerPeer = args->nChannels; + netAttr.recvCommAttr.minFlowsPerPeer = args->nChannels; + } + + netAttr.op = BIT(args->collAPI); + // algo/proto are undefined for p2p + if (args->collAPI < NCCL_NUM_FUNCTIONS) { + netAttr.algo = BIT(args->algorithm); + netAttr.proto = BIT(args->protocol); + } + + if (memcmp(&proxyState->netAttr, &netAttr, sizeof(netAttr))) { + setNetAttrs(proxyState, &netAttr); + printNetAttrs(&netAttr, send ? "send" : "recv"); + } +} + // Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); @@ -307,11 +402,12 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; - int trafficClass; + ncclNetAttr_t netAttr; }; struct netRecvConnectArgs { int proxyRank; + ncclNetAttr_t netAttr; }; static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { @@ -331,7 +427,9 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); - args.trafficClass = comm->config.trafficClass; + + populateCommNetAttrs(comm, send, &args.netAttr); + NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -442,6 +540,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne opId, &recv->proxyConn, connectInfo); netRecvConnectArgs args = {0}; args.proxyRank = *((int*)connectInfo); + + populateCommNetAttrs(comm, recv, &args.netAttr); + NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId)); } else { opId = recv; @@ -677,7 +778,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); + NCCLCHECK(proxyState->ncclNet->listen(proxyState->netContext, req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; @@ -707,11 +808,12 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); - ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; - commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass; + + setNetAttrs(proxyState, &req->netAttr); + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers @@ -736,25 +838,29 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str comms->activeConnect[resources->channelId] = (resources->tpLocalRank + 1); if (comms->sendComm[resources->channelId] == NULL && comms->activeConnect[resources->channelId] == (resources->tpLocalRank + 1)) { - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); } resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } - NCCLCHECK(ret); + if (ret != ncclSuccess) { + if (resources->netSendComm) proxyState->ncclNet->closeSend(resources->netSendComm); + NCCLCHECK(ret); + } if (resources->netSendComm == NULL) { *done = 0; return ncclInProgress; } + printNetAttrs(&req->netAttr, "send connect"); *done = 1; if (resources->netDeviceHandle) { @@ -872,6 +978,8 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str resources->tpRemoteProxyRank = req->proxyRank; ncclResult_t ret = ncclSuccess; + setNetAttrs(proxyState, &req->netAttr); + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, true /*isRecv*/, &resources->netDeviceHandle)); // Finish connection establishment from remote peer if (resources->shared) { @@ -917,6 +1025,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str *done = 0; return ncclInProgress; } + printNetAttrs(&req->netAttr, "recv connect"); *done = 1; if (resources->netDeviceHandle) { @@ -1106,6 +1215,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + int checkedNetAttr = 0; if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -1207,6 +1317,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // since size is a plain integer. 
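// [Editorial sketch, not part of the patch] How the new netAttr hints flow fits together:
// populateCommNetAttrs() derives per-communicator bounds (min/max concurrent peers and flows per
// peer) at connect time, setXferNetAttrs() narrows them to the peers/channels of a single proxy
// operation, and setNetAttrs() forwards them to the plugin only when they differ from the cached
// proxyState->netAttr. A minimal stand-alone illustration of that "apply only on change" guard
// follows; the type and function names below are invented for the example.
#include <cstring>
struct XferHints { int minPeers, maxPeers, minFlows, maxFlows; };
struct HintCache {
  XferHints applied{};                        // last hints handed to the plugin
  void (*apply)(const XferHints*) = nullptr;  // optional plugin hook, akin to setNetAttr
};
static void maybeApplyHints(HintCache* cache, const XferHints* wanted) {
  if (cache->apply == nullptr) return;                                    // plugin takes no hints
  if (std::memcmp(&cache->applied, wanted, sizeof(*wanted)) == 0) return; // nothing changed
  cache->apply(wanted);                                                   // push the new hints
  cache->applied = *wanted;                                               // remember what was applied
}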
// coverity[use_invalid:FALSE] void* phandle = &sub->pHandles[DIVUP(transmittedStepId, args->sliceSteps)%NCCL_STEPS]; + if (!checkedNetAttr++) + setXferNetAttrs(proxyState, args, 1); NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, phandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); @@ -1258,6 +1370,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + int checkedNetAttr = 0; if (args->state == ncclProxyOpReady) { // Initialize subs and group them by same recvComm. void* recvComm; @@ -1363,6 +1476,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); + if (!checkedNetAttr++) + setXferNetAttrs(proxyState, args, 0); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { @@ -1594,13 +1709,18 @@ ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0 && nPeers > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + } } exit: @@ -1627,13 +1747,14 @@ ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclNetCleanupCallback *record = NULL; struct ncclReg *regRecord = NULL; - void *baseSend; - size_t baseSendSize; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0 && nPeers > 0) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); - NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + NCCLCHECKGOTO(ncclCommGraphRegister(comm, base, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); if (*outRegBufFlag) { NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); diff --git 
a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 709e7ad40..3614dec61 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -21,6 +21,7 @@ #include #include #include +#include #define ENABLE_TIMER 0 #include "timer.h" @@ -32,6 +33,8 @@ static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; +static ncclNetCommConfig_t ibContext; + struct ncclIbMr { uintptr_t addr; size_t pages; @@ -70,7 +73,7 @@ const char* ibProviderName[] = { static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { - pthread_mutex_t lock; + std::mutex mutex; int device; uint64_t guid; uint8_t portNum; @@ -102,9 +105,16 @@ struct alignas(64) ncclIbDev { #define MAX_IB_VDEVS MAX_IB_DEVS*8 struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS]; struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; -pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclIbMutex; static int ncclIbRelaxedOrderingEnabled = 0; +// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator +// rather than once for all communicators. However, the internal plugin implementation +// still assumes the plugin is initialized only once across all communicators. The ref +// counter makes sure the plugin internally initializes only once. When per communicator +// context support is added to the plugin the ref counter can be removed. +static int netRefCount; + #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) #define NCCL_IB_SL_DEFAULT 0 @@ -184,6 +194,9 @@ static void* ncclIbAsyncThreadMain(void* args) { // SRQ are not used in NCCL WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str); break; + case IBV_EVENT_GID_CHANGE: + WARN("NET/IB : %s:%d GID table changed", dev->devName, dev->portNum); + break; case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_PORT_ERR: case IBV_EVENT_PATH_MIG: @@ -597,15 +610,22 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { - pthread_mutex_lock(&ncclIbLock); + std::lock_guard lock(ncclIbMutex); ncclResult_t res = ncclIbMakeVDeviceInternal(d, props); - pthread_mutex_unlock(&ncclIbLock); return res; + +} + +ncclResult_t ncclIbSetNetAttr(void *ctx, ncclNetAttr_t *netAttr) { + (void)ctx; + (void)netAttr; + return ncclSuccess; } static ncclProfilerCallback_t ncclProfilerFunction; -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { +ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + if (netRefCount++) return ncclSuccess; ncclResult_t ret = ncclSuccess; ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; @@ -614,7 +634,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. 
Advance features like CX-8 Direct-NIC will be disabled."); } if (ncclNIbDevs == -1) { - pthread_mutex_lock(&ncclIbLock); + std::lock_guard lock(ncclIbMutex); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { int nIpIfs = 0; @@ -644,25 +664,15 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } for (int d=0; dname); continue; } - enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; - char dataDirectDevicePath[PATH_MAX]; - int dataDirectSupported = 0; - int skipNetDevForDataDirect = 0; - if (wrap_mlx5dv_is_supported(devices[d])) { - ibProvider = IB_PROVIDER_MLX5; - snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); - if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_INIT|NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device:%s", devices[d]->name); - // Now check whether Data Direct has been disabled by the user - if(ncclParamIbDataDirect() == 1) { dataDirectSupported = 1; skipNetDevForDataDirect = 1; } - if(ncclParamIbDataDirect() == 2) { dataDirectSupported = 1; skipNetDevForDataDirect = 0; } - } - } + char dataDirectDevicePath[PATH_MAX] = "/sys"; + int devCount = /*undefined*/-1, devOffset = 0; + enum ncclIbProvider ibProvider = wrap_mlx5dv_is_supported(devices[d]) ? IB_PROVIDER_MLX5 : IB_PROVIDER_NONE; + int nPorts = 0; struct ibv_device_attr devAttr; memset(&devAttr, 0, sizeof(devAttr)); @@ -672,78 +682,99 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - // dataDirect = 0 exposes the devices normally, dataDirect = 1 exposes the devices through direct NIC - for (int dataDirect = skipNetDevForDataDirect; dataDirect < 1 + dataDirectSupported; ++dataDirect) { struct ibv_port_attr portAttr; if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { WARN("NET/IB : Unable to query port_num %d", port_num); continue; } if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; // check against user specified HCAs/ports if (! 
(matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } - pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; - ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; - ncclIbDevs[ncclNIbDevs].portAttr = portAttr; - ncclIbDevs[ncclNIbDevs].portNum = port_num; - ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - if (portAttr.active_speed_ex) - // A non-zero active_speed_ex indicates XDR rate (0x100) or higher - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); - else - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); - ncclIbDevs[ncclNIbDevs].context = context; - ncclIbDevs[ncclNIbDevs].pdRefs = 0; - ncclIbDevs[ncclNIbDevs].pd = NULL; - if (!dataDirect) { - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - } else { - snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); - NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); - strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); - ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + + // check for mlx5 data direct support only once for a each device + if (devCount == -1) { + devCount = 1; + devOffset = 0; + if (ncclParamIbDataDirect() > 0 && ibProvider == IB_PROVIDER_MLX5 && ncclMlx5dvDmaBufCapable(context)) { + int pathLen = strlen(dataDirectDevicePath); + ncclResult_t res = wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + pathLen, sizeof(dataDirectDevicePath) - pathLen); + if (res == ncclSuccess) { + // data direct devices are exposed twice: with the C2C + PCIe link and with the data direct link + devCount = 2; + // by default only expose the data direct NIC (devOffset = 1), unless set to 2 by the user + devOffset = (ncclParamIbDataDirect() == 2) ? 
0 : 1; + INFO(NCCL_INIT | NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device %s", devices[d]->name); + } else if (res == ncclInvalidArgument) { + TRACE(NCCL_NET, "NET/IB: Device %s does not support Data Direct DMA.", devices[d]->name); + } else { + WARN("NET/IB: Error in mlx5dv_get_data_direct_sysfs_path with device %s", devices[d]->name); + return res; + } + } + } + for (int dev = devOffset; dev < devCount; ++dev) { + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; + ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; + ncclIbDevs[ncclNIbDevs].portAttr = portAttr; + ncclIbDevs[ncclNIbDevs].portNum = port_num; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + if (portAttr.active_speed_ex) { + // A non-zero active_speed_ex indicates XDR rate (0x100) or higher + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); + } else { + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + } + ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; + if (dev == 0) { + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); + } else { + snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); + NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); + strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); + ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + } + ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); + + // Enable ADAPTIVE_ROUTING by default on IB networks + // But allow it to be overloaded by an env parameter + ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; + if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); + + INFO(NCCL_NET, "NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, + ncclIbDevs[ncclNIbDevs].portNum, NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, + ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + + PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); + ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); + PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d + + // Add this plain physical device to the list of virtual devices + int vDev; + ncclNetVDeviceProps_t vProps = {0}; + vProps.ndevs = 1; + vProps.devs[0] = ncclNIbDevs; + NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); + + ncclNIbDevs++; + nPorts++; } - ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; - ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; - ncclIbDevs[ncclNIbDevs].mrCache.population = 0; - ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; - NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); - - // Enable ADAPTIVE_ROUTING by default on IB networks - // But allow it to be overloaded by an env parameter - ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0; - if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - - INFO(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); - - PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); - ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); - PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d - - // Add this plain physical device to the list of virtual devices - int vDev; - ncclNetVDeviceProps_t vProps = {0}; - vProps.ndevs = 1; - vProps.devs[0] = ncclNIbDevs; - NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); - - ncclNIbDevs++; - nPorts++; - } } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } } - if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; }; + if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; } } if (ncclNIbDevs == 0) { INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found."); @@ -762,12 +793,12 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? 
"[RO]" : "", ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); - pthread_mutex_unlock(&ncclIbLock); } exit: + ibContext.trafficClass = config->trafficClass; + *ctx = &ibContext; return ret; fail: - pthread_mutex_unlock(&ncclIbLock); goto exit; } @@ -789,8 +820,8 @@ static void ibGdrSupportInitOnce() { KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version"); } ncclResult_t ncclIbGdrSupport() { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, ibGdrSupportInitOnce); + static std::once_flag once; + std::call_once(once, ibGdrSupportInitOnce); if (!ncclIbGdrModuleLoaded) return ncclSystemError; return ncclSuccess; @@ -825,13 +856,10 @@ static void ibDmaBufSupportInitOnce(){ // ncclSuccess : DMA-BUF support is available // ncclSystemError : DMA-BUF is not supported by the kernel ncclResult_t ncclIbDmaBufSupport(int dev) { - struct oncewrap { - pthread_once_t once = PTHREAD_ONCE_INIT; - }; - static oncewrap onces[MAX_IB_DEVS]; + static std::once_flag onces[MAX_IB_DEVS]; // init the device only once ibDmaSupportInitDev = dev; - pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce); + std::call_once(onces[dev], ibDmaBufSupportInitOnce); ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; int dmaBufSupported = ibDev->dmaBufSupported; @@ -843,7 +871,7 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { struct ncclIbDev* ibDev = ncclIbDevs + dev; - pthread_mutex_lock(&ibDev->lock); + std::lock_guard lock(ibDev->mutex); props->name = ibDev->devName; props->speed = ibDev->speed; props->pciPath = ibDev->pciPath; @@ -867,7 +895,8 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; - pthread_mutex_unlock(&ibDev->lock); + props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -1132,18 +1161,13 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) { base->ibDevN = ibDevN; ncclIbDev* ibDev = ncclIbDevs + ibDevN; - pthread_mutex_lock(&ibDev->lock); - if (0 == ibDev->pdRefs++) { - ncclResult_t res; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context), res, failure); - if (0) { - failure: - pthread_mutex_unlock(&ibDev->lock); - return res; + { + std::lock_guard lock(ibDev->mutex); + if (0 == ibDev->pdRefs++) { + NCCLCHECK(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context)); } + base->pd = ibDev->pd; } - base->pd = ibDev->pd; - pthread_mutex_unlock(&ibDev->lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). 
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0)); @@ -1152,17 +1176,13 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base } ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) { - ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(base->cq)); - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); if (0 == --ncclIbDevs[base->ibDevN].pdRefs) { - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd), res, returning); + NCCLCHECK(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd)); } - res = ncclSuccess; -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclSuccess; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) { @@ -1250,7 +1270,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { return ncclSuccess; } -ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { +ncclResult_t ncclIbListen(void* ctx, int dev, void* opaqueHandle, void** listenComm) { ncclResult_t ret = ncclSuccess; struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); @@ -1271,7 +1291,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { goto exit; } -ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; @@ -1333,6 +1353,7 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; stage->offset = 0; ncclNetVDeviceProps_t remoteVProps; + ncclNetCommConfig_t* config; memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); mergedDev = ncclIbMergedDevs + dev; comm->base.vProps = mergedDev->vProps; @@ -1425,6 +1446,7 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan return ncclInternalError; } } + config = (ncclNetCommConfig_t*)ctx; meta.fifoAddr = (uint64_t)comm->fifo; meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT; @@ -1856,13 +1878,12 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); for (int slot=0; /*true*/; slot++) { if (slot == cache->population || addr < cache->slots[slot].addr) { // didn't find in cache if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; - NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); + NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); } // Deregister / register struct ibv_mr* mr; @@ -1871,17 +1892,17 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s if (fd != -1) { /* DMA-BUF support */ if (!ncclIbDevs[base->ibDevN].capsProvider.mlx5.dataDirect) { - NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags)); } else { - NCCLCHECKGOTO(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT), res, returning); + NCCLCHECK(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT)); } } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support - NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags)); } else { - NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags)); } } TRACE(NCCL_INIT|NCCL_NET,"regAddr=0x%lx size=%lld rkey=0x%x lkey=0x%x fd=%d", (unsigned long)addr, (long long)pages*pageSize, mr->rkey, mr->lkey, fd); @@ -1892,19 +1913,15 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s cache->slots[slot].mr = mr; cache->population += 1; *mhandle = mr; - res = ncclSuccess; - goto returning; + return ncclSuccess; } else if ((addr >= cache->slots[slot].addr) && ((addr-cache->slots[slot].addr)/pageSize+pages) <= cache->slots[slot].pages) { cache->slots[slot].refs += 1; *mhandle = cache->slots[slot].mr; - res = ncclSuccess; - goto returning; + return ncclSuccess; } } -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclSuccess; } struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, int devIndex) { @@ -1942,8 +1959,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** m ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) { struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; - ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); for (int i=0; i < cache->population; i++) { if (mhandle == cache->slots[i].mr) { if (0 == --cache->slots[i].refs) { @@ -1953,17 +1969,13 @@ ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) cache->slots = NULL; cache->capacity = 0; } - NCCLCHECKGOTO(wrap_ibv_dereg_mr(mhandle), res, returning); + NCCLCHECK(wrap_ibv_dereg_mr(mhandle)); } - res = ncclSuccess; - goto returning; + return ncclSuccess; } } WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); - res = ncclInternalError; -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclInternalError; } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { @@ -2567,6 +2579,11 @@ ncclResult_t ncclIbCloseListen(void* listenComm) { return ncclSuccess; } +ncclResult_t ncclIbFinalize(void* ctx) { + 
netRefCount--; + return ncclSuccess; +} + ncclNet_t ncclNetIb = { "IB", ncclIbInit, @@ -2587,7 +2604,9 @@ ncclNet_t ncclNetIb = { ncclIbCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */, - ncclIbMakeVDevice + ncclIbMakeVDevice, + ncclIbFinalize, + ncclIbSetNetAttr, }; /* diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 985810c47..fa331aae2 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -16,6 +16,8 @@ #include #include #include +#include <mutex> +#include <condition_variable> /* Init functions */ static int ncclNetIfs = -1; @@ -26,7 +28,7 @@ struct ncclNetSocketDev { }; static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS]; -pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclNetSocketMutex; static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { char devicePath[PATH_MAX]; @@ -38,17 +40,24 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { static ncclProfilerCallback_t ncclProfilerFunction; -ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { +// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator +// rather than once for all communicators. However, the internal plugin implementation +// still assumes the plugin is initialized only once across all communicators. The ref +// counter makes sure the plugin internally initializes only once. When per communicator +// context support is added to the plugin the ref counter can be removed. +static int netRefCount; + +ncclResult_t ncclNetSocketInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + if (netRefCount++) return ncclSuccess; ncclProfilerFunction = profFunction; if (ncclNetIfs == -1) { - pthread_mutex_lock(&ncclNetSocketLock); + std::lock_guard lock(ncclNetSocketMutex); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; NCCLCHECK(ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS, &ncclNetIfs)); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); - pthread_mutex_unlock(&ncclNetSocketLock); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) @@ -67,7 +76,6 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallba INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line); } } - pthread_mutex_unlock(&ncclNetSocketLock); } return ncclSuccess; } @@ -116,6 +124,8 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -193,8 +203,8 @@ struct ncclNetSocketThreadResources { int stop; struct ncclNetSocketComm* comm; struct ncclProfilerInfo* pInfo; - pthread_mutex_t threadLock; - pthread_cond_t threadCond; + std::mutex threadMutex; + std::condition_variable threadCond; }; struct ncclNetSocketListenComm { @@ -269,11 +279,8 @@ void* persistentSocketThread(void *args_) { } while (repeat); } if (idle) { - pthread_mutex_lock(&resource->threadLock); - while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait - pthread_cond_wait(&resource->threadCond, &resource->threadLock); - } - pthread_mutex_unlock(&resource->threadLock); + std::unique_lock lock(resource->threadMutex); +
resource->threadCond.wait(lock, [&] { return mark != myQueue->next || resource->stop; }); } if (resource->stop) return NULL; } @@ -335,7 +342,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { goto exit; } -ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { +ncclResult_t ncclNetSocketListen(void* ctx, int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs); return ncclInternalError; @@ -364,7 +371,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) } #define SOCKET_CTRL_SIZE (sizeof(int)) -ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclNetSocketConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -380,7 +387,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check; if (stage->state == ncclNetSocketCommStateSend) goto socket_send; - NCCLCHECK(ncclCalloc(&comm, 1)); + comm = new ncclNetSocketComm(); stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; @@ -422,7 +429,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check; if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv; - NCCLCHECK(ncclCalloc(&rComm, 1)); + rComm = new ncclNetSocketComm(); stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; @@ -449,9 +456,9 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic if (done == 0) return ncclSuccess; if (sendSockIdx == rComm->nSocks) - memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); + rComm->ctrlSock = *sock; else - memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); + rComm->socks[sendSockIdx] = *sock; free(sock); } NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); @@ -501,8 +508,6 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro #ifdef NCCL_ENABLE_NET_PROFILING res->pInfo = pInfo; #endif - pthread_mutex_init(&res->threadLock, NULL); - pthread_cond_init(&res->threadCond, NULL); PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 
'S' : 'R', comm->dev, tid, comm->cudaDev); } @@ -517,10 +522,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; - pthread_mutex_lock(&res->threadLock); + std::lock_guard lock(res->threadMutex); queue->next = (queue->next+1)%queue->len; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); + res->threadCond.notify_one(); return ncclSuccess; } WARN("NET/Socket : unable to allocate subtasks"); @@ -686,10 +690,11 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { for (int i=0; inThreads; i++) { struct ncclNetSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { - pthread_mutex_lock(&res->threadLock); - res->stop = 1; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); + { + std::lock_guard lock(res->threadMutex); + res->stop = 1; + res->threadCond.notify_one(); + } PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join"); } free(res->threadTaskQueue.tasks); @@ -702,11 +707,16 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } if(comm->inlineData) free(comm->inlineData); - free(comm); + delete comm; } return ncclSuccess; } +ncclResult_t ncclNetSocketFinalize(void* ctx) { + netRefCount--; + return ncclSuccess; +} + ncclNet_t ncclNetSocket = { "Socket", ncclNetSocketInit, @@ -727,5 +737,7 @@ ncclNet_t ncclNetSocket = { ncclNetSocketCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */, - NULL /* mergeDevices */ + NULL /* mergeDevices */, + ncclNetSocketFinalize, + NULL /* setNetAttr */, }; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index da8d263f1..1f13bb01b 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -48,7 +48,7 @@ struct ncclTransport nvlsTransport = { { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; -ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { +ncclResult_t ncclNvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; size_t size = prop->size; @@ -70,7 +70,7 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, return ncclSuccess; } -ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { +ncclResult_t ncclNvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; int fd = -1; ncclResult_t ret = ncclSuccess; @@ -205,7 +205,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; if (comm && comm->nvlsSupport && comm->nNodes > 1) { - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < comm->nvlsChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); @@ -242,12 +242,12 @@ static 
ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc mcprop.size = mcsize; if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); allocMcHandle = 1; NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); allocMcHandle = 1; } @@ -330,7 +330,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { nHeads = comm->channels[0].nvls.nHeads; headRank = comm->channels[0].nvls.headRank; resources = comm->nvlsResources; - nChannels = comm->nvlsResources->nChannels; + nChannels = comm->nvlsChannels; nvlsStepSize = comm->nvlsChunkSize; buffSize = nvlsStepSize * NCCL_STEPS; nvlsPerRankSize = nChannels * 2 * buffSize; @@ -391,7 +391,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < comm->nvlsChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, fail); } @@ -400,7 +400,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { } else { struct ncclNvlsSharedRes* resources = NULL; int nHeads = comm->channels[0].nvls.nHeads; - int nChannels = comm->nChannels; + int nChannels = comm->nvlsChannels; size_t memSize = 64; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; @@ -420,7 +420,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { } comm->nvlsResources->nChannels = comm->nvlsChannels; - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < nChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, NULL, false), res, fail); } @@ -486,21 +486,21 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { // MNNVL does not support NVLS buffer registration if (!comm->MNNVL && comm->nvlsResources->nvlsShmemHandle == NULL) { /* create shared memory for fast NVLS buffer registration */ - typeSize = sizeof(struct localRegData) << 1; + typeSize = DIVUP(sizeof(struct localRegData) << 1, CACHE_LINE_SIZE) * CACHE_LINE_SIZE; if (comm->localRank == 0) { shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (CACHE_LINE_SIZE * comm->localRanks + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); } else { 
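// [Editorial sketch, not part of the patch] The typeSize change above rounds each per-rank
// registration slot up to a CACHE_LINE_SIZE multiple, and the shared counter area now reserves
// CACHE_LINE_SIZE * localRanks bytes instead of a single size_t, presumably so concurrent ranks
// do not false-share a cache line. DIVUP(x, a) * a is the usual round-up-to-a-multiple idiom;
// a tiny stand-alone equivalent (names invented for the example):
#include <cstddef>
constexpr std::size_t roundUpTo(std::size_t x, std::size_t align) {
  return ((x + align - 1) / align) * align;  // e.g. roundUpTo(100, 128) == 128
}
static_assert(roundUpTo(100, 128) == 128, "pads a partial cache line up to a full one");
static_assert(roundUpTo(256, 128) == 256, "already-aligned sizes are unchanged");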
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); - NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (CACHE_LINE_SIZE * comm->localRanks + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); } /* need 2 pools and a shared counter for shmem-based collectives */ comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; - comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + CACHE_LINE_SIZE * comm->localRanks); comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks); - comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + CACHE_LINE_SIZE * comm->localRanks); comm->nvlsResources->nvlsShmem.round = 0; comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; } @@ -607,11 +607,11 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t mcprop.size = mcsize; if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail); @@ -751,17 +751,36 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send struct ncclReg *recvRegRecord = NULL; bool sendIsValid = false; bool recvIsValid = false; + void *baseSend = NULL; + void *baseRecv = NULL; + size_t baseSendSize = 0; + size_t baseRecvSize = 0; *outRegBufUsed = 0; if (sendbuff) { NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord)); NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid)); + if (sendIsValid) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + if ((uint64_t)baseSend + baseSendSize < (uint64_t)sendbuff + sendbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } + } } else { sendIsValid = true; } + if (recvbuff) { NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord)); NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid)); + if (recvIsValid) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + if 
((uint64_t)baseRecv + baseRecvSize < (uint64_t)recvbuff + recvbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } + } } else { recvIsValid = true; } @@ -769,6 +788,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send if (sendIsValid && recvIsValid) NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); +exit: return ncclSuccess; } @@ -802,11 +822,19 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( *outRegBufUsed = 0; if (sendbuff) { CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + if ((uint64_t)baseSend + baseSendSize < (uint64_t)sendbuff + sendbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord)); } if (recvbuff) { CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + if ((uint64_t)baseRecv + baseRecvSize < (uint64_t)recvbuff + recvbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); } @@ -835,82 +863,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord)); } - return ncclSuccess; -} - -ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport) { - CUmulticastObjectProp mcprop = {}; - CUmemGenericAllocationHandle mcHandle; - char shareableHandle[NVLS_HANDLE_SIZE]; - CUmemAccessDesc accessDesc = {}; - - mcprop.numDevices = comm->localRanks; - mcprop.handleTypes = ncclCuMemHandleType; - mcprop.flags = 0; - mcprop.size = comm->baseStride; - - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); - } - - CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail); - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&comm->baseMCSymPtr, comm->baseStride, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, 0, mcHandle, 0), ret, fail); - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = comm->cudaDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, &accessDesc, 1), ret, fail); - comm->symMCHandle = mcHandle; - } exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport && comm->baseMCSymPtr) { - CUCHECKGOTO(cuMemUnmap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); - 
CUCHECKGOTO(cuMemAddressFree((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); - CUCHECKGOTO(cuMemRelease(comm->symMCHandle), ret, fail); - } -exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { - ncclResult_t ret = ncclSuccess; - assert((uintptr_t)ucaddr % NCCL_REC_PAGE_SIZE == 0 && ucsize % NCCL_REC_PAGE_SIZE == 0); - if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { - CUCHECKGOTO(cuMulticastBindAddr(comm->symMCHandle, offset, (CUdeviceptr)ucaddr, ucsize, 0), ret, fail); - INFO(NCCL_ALLOC, "NVLS symmetric alloc mc buffer ptr %p offset %ld UC addr %p UC size %ld symAllocHead %ld", comm->baseMCSymPtr + offset, offset, ucaddr, ucsize, comm->symAllocHead); - } - -exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { - size_t offset = (size_t)ucaddr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); - CUCHECKGOTO(cuMulticastUnbind(comm->symMCHandle, comm->cudaDev, offset, ucsize), ret, fail); - } -exit: - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index d263dda3a..d9fd01da0 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -955,6 +955,8 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *baseAddr = NULL; + size_t baseSize = 0; *regBufFlag = 0; *offsetOut = 0; @@ -962,8 +964,11 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si if (comm && userbuff && buffSize > 0 && nPeers > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)baseAddr + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail); + } } exit: @@ -1001,6 +1006,7 @@ ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, si *peerRmtAddrsOut = NULL; if (comm && userbuff && buffSize > 0 && nPeers > 0) { CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)baseAddr + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail); if (*regBufFlag) { @@ -1118,88 +1124,6 @@ static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, s goto exit; } -ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm) { - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&comm->baseUCSymPtr, comm->baseStride * comm->localRanks, NCCL_MAX_PAGE_SIZE, 0, 0)); - return ncclSuccess; -} - -ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm) { - if (comm->baseUCSymPtr) { 
- CUCHECK(cuMemAddressFree((CUdeviceptr)comm->baseUCSymPtr, comm->baseStride * comm->localRanks)); - } - return ncclSuccess; -} - -ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr) { - ncclResult_t ret = ncclSuccess; - CUmemGenericAllocationHandle impHandle; - int impFd = -1; - ncclCuDesc* desc = NULL; - CUmemAccessDesc accessDesc = {}; - - assert(offset % NCCL_REC_PAGE_SIZE == 0 && size % NCCL_REC_PAGE_SIZE == 0); - NCCLCHECKGOTO(ncclCalloc(&desc, comm->localRanks), ret, fail); - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - memcpy(&desc[comm->localRank].data, &memHandle, sizeof(CUmemGenericAllocationHandle)); - } else { - CUCHECKGOTO(cuMemExportToShareableHandle(&desc[comm->localRank].handle, memHandle, ncclCuMemHandleType, 0), ret, fail); - } - - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, desc, sizeof(ncclCuDesc)), ret, fail); - - // start mapping - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = comm->cudaDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - for (int r = 0; r < comm->localRanks; ++r) { - CUdeviceptr maddr; - if (r == comm->localRank) { - impHandle = memHandle; - } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - impFd = -1; - NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, comm->localRankToRank[r], &desc[r].data, &impFd), ret, fail); - CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)(uintptr_t)impFd, ncclCuMemHandleType), ret, fail); - SYSCHECKGOTO(close(impFd), "close", ret, fail); - } else { - CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&desc[r].handle, ncclCuMemHandleType), ret, fail); - } - } - maddr = (CUdeviceptr)(comm->baseUCSymPtr + (size_t)r * comm->baseStride + offset); - CUCHECKGOTO(cuMemMap(maddr, size, 0, impHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(maddr, size, &accessDesc, 1), ret, fail); - - if (r == comm->localRank) { - *symPtr = (void*)maddr; - } else { - CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); - } - } - - INFO(NCCL_ALLOC, "IPC symmetric alloc buffer %p offset %ld size %ld symAllocHead %ld", *symPtr, offset, size, comm->symAllocHead); - -exit: - free(desc); - return ret; -fail: - goto exit; -} - -ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr) { - ncclResult_t ret = ncclSuccess; - if (comm && symPtr && size > 0) { - size_t offset = (size_t)symPtr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); - for (int r = 0; r < comm->localRanks; ++r) { - CUdeviceptr peerAddr = (CUdeviceptr)(comm->baseUCSymPtr + r * comm->baseStride + offset); - CUCHECKGOTO(cuMemUnmap(peerAddr, size), ret, fail); - } - } -exit: - return ret; -fail: - goto exit; -} - struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc index 6e7b33c16..354fa57bb 100644 --- a/src/transport/profiler.cc +++ b/src/transport/profiler.cc @@ -10,7 +10,7 @@ static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { connection->proxyAppendPtr = &connection->proxyAppend; - connection->shared = 1; + connection->shared = 0; return ncclSuccess; } From e11d7f77c126561e35909407a5bd1461a437322b Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: 
Wed, 10 Sep 2025 15:38:04 -0700 Subject: [PATCH 18/21] Add root CMakeLists.txt file --- CMakeLists.txt | 166 +++++++++++++++++++++++++++++++++++++++++ Makefile | 9 +-- ext-net/CMakeLists.txt | 4 + 3 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 ext-net/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..1941cdafe --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,166 @@ +cmake_minimum_required(VERSION 3.25) # ipp6 is using 3.28 + +# Version information +# Read makefiles/version.mk file +file(READ ${CMAKE_SOURCE_DIR}/makefiles/version.mk VERSION_CONTENT) +string(REGEX REPLACE ".*NCCL_MAJOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MAJOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_MINOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MINOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_PATCH[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_PATCH "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_SUFFIX[ ]*:=[ ]*([a-zA-Z0-9]*).*" "\\1" NCCL_SUFFIX "${VERSION_CONTENT}") +string(REGEX REPLACE ".*PKG_REVISION[ ]*:=[ ]*([0-9]+).*" "\\1" PKG_REVISION "${VERSION_CONTENT}") +math(EXPR NCCL_VERSION_CODE "(${NCCL_MAJOR} * 10000) + (${NCCL_MINOR} * 100) + ${NCCL_PATCH}") + +# Make version information available to C++ source files +add_compile_definitions( + NCCL_USE_CMAKE + NCCL_MAJOR=${NCCL_MAJOR} + NCCL_MINOR=${NCCL_MINOR} + NCCL_PATCH=${NCCL_PATCH} + NCCL_VERSION_CODE=${NCCL_VERSION_CODE} +) + +set(ENV{NCCL_USE_CMAKE} "1") + +project(NCCL VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH} + LANGUAGES CUDA CXX C) + +# Make CMAKE_BUILD_TYPE to release by default if not set +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() + +option(VERBOSE "Enable verbose output" OFF) +option(KEEP "Keep intermediate files" OFF) +option(DEBUG "Enable debug build" OFF) +option(ASAN "Enable Address Sanitizer" OFF) +option(UBSAN "Enable Undefined Behavior Sanitizer" OFF) +option(TRACE "Enable tracing" OFF) +option(WERROR "Treat warnings as errors" OFF) +option(PROFAPI "Enable profiling API" ON) +option(NVTX "Enable NVTX" ON) +option(RDMA_CORE "Enable RDMA core" OFF) +option(NET_PROFILER "Enable network profiler" OFF) +option(MLX5DV "Enable MLX5DV" OFF) +option(MAX_EXT_NET_PLUGINS "Maximum external network plugins" 0) + +find_package(CUDAToolkit REQUIRED) + +# CUDA version detection +string(REGEX MATCH "([0-9]+\\.[0-9]+)" CUDA_VERSION "${CUDAToolkit_VERSION}") + +# Extract major and minor version numbers +string(REGEX MATCH "([0-9]+)" CUDA_MAJOR "${CUDA_VERSION}") +string(REGEX MATCH "([0-9]+)$" CUDA_MINOR "${CUDA_VERSION}") +string(REGEX REPLACE ".*\\.([0-9]+)$" "\\1" CUDA_MINOR "${CUDA_VERSION}") + +# Add CUDA version definitions after find_package +add_compile_definitions( + CUDA_MAJOR=${CUDA_MAJOR} + CUDA_MINOR=${CUDA_MINOR} +) + +# CUDA architecture flags +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "") + message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined or empty, setting default values based on CUDA version") + + if(${CUDA_MAJOR} LESS 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61") + elseif(${CUDA_MAJOR} EQUAL 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 10) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 11) + if(${CUDA_MINOR} LESS 8) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80") + else() + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80;90") + endif() + elseif(${CUDA_MAJOR} EQUAL 12) + if(${CUDA_MINOR} LESS 8) + 
set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90") + else() + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;120") + endif() + elseif(${CUDA_MAJOR} EQUAL 13) + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + else() + # For future CUDA versions, include all architectures up to the latest known + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + endif() +endif() +message(STATUS "Using CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -Wvla -g") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -fPIC") + +# Sanitizer options +if(ASAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -static-libasan") +endif() + +if(UBSAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined -static-libubsan") +endif() + +# Additional options +if(TRACE) + add_definitions(-DENABLE_TRACE) +endif() + +if(NOT NVTX) + add_definitions(-DNVTX_DISABLE) +endif() + +if(WERROR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +endif() + +if(PROFAPI) + add_definitions(-DPROFAPI) +endif() + +set(EXTRA_LIBS) + +# RDMA and MLX5DV are Linux-specific features +if(RDMA_CORE) + add_definitions(-DNCCL_BUILD_RDMA_CORE=1) + find_library(VERBS_LIBRARY NAMES verbs) + if(VERBS_LIBRARY) + list(APPEND EXTRA_LIBS ${VERBS_LIBRARY}) + endif() +endif() + +if(MLX5DV) + add_definitions(-DNCCL_BUILD_MLX5DV=1) + find_library(MLX5_LIBRARY NAMES mlx5) + if(MLX5_LIBRARY) + list(APPEND EXTRA_LIBS ${MLX5_LIBRARY}) + endif() +endif() + +if(NET_PROFILER) + add_definitions(-DNCCL_ENABLE_NET_PROFILING=1) +endif() + +if(MAX_EXT_NET_PLUGINS GREATER 0) + add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS}) +endif() + +# Library dependencies +find_library(RT_LIBRARY NAMES rt) +if(RT_LIBRARY) + list(APPEND EXTRA_LIBS ${RT_LIBRARY}) +endif() + +# Debug/Release specific flags +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS} -O0 -G -g") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS} -O3") + +add_subdirectory(ext-net) +add_subdirectory(ext-profiler/example) +add_subdirectory(ext-tuner/example) +add_subdirectory(src) diff --git a/Makefile b/Makefile index caed3d42a..458a50741 100644 --- a/Makefile +++ b/Makefile @@ -3,15 +3,14 @@ # # See LICENSE.txt for license information # -.PHONY : all clean +.PHONY: all clean -default : src.build -install : src.install +default: src.build +install: src.install BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} -test.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) @@ -19,7 +18,7 @@ lic: $(LICENSE_TARGETS) ${BUILDDIR}/%.txt: %.txt @printf "Copying %-35s > %s\n" $< $@ mkdir -p ${BUILDDIR} - cp $< $@ + install -m 644 $< $@ src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} diff --git a/ext-net/CMakeLists.txt b/ext-net/CMakeLists.txt new file mode 100644 index 000000000..a2cc38df2 --- /dev/null +++ b/ext-net/CMakeLists.txt @@ -0,0 +1,4 @@ +# Since all the plugins generate binary with the same name, build only one of them +add_subdirectory(example) +# add_subdirectory(ib_sharp) +# 
add_subdirectory(mock) From 8d26308e6aba7f1667b24a861b5dc73f0f2e1f40 Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Wed, 24 Sep 2025 12:31:42 -0700 Subject: [PATCH 19/21] Add examples directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NCCL examples directory provides users and developers with practical code samples that highlight NCCL’s core features. It covers basic operations like communicator initialization, point-to-point communication, and collective operations, as well as advanced features such as User Buffer (UB), symmetric memory, and the device API. --- Makefile | 6 +- .../Makefile | 60 ++++ .../README.md | 177 ++++++++++ .../main.cc | 256 ++++++++++++++ .../02_one_device_per_pthread/Makefile | 68 ++++ .../02_one_device_per_pthread/README.md | 158 +++++++++ .../02_one_device_per_pthread/main.cc | 196 ++++++++++ .../03_one_device_per_process_mpi/Makefile | 66 ++++ .../03_one_device_per_process_mpi/README.md | 196 ++++++++++ .../03_one_device_per_process_mpi/main.cc | 249 +++++++++++++ examples/01_communicators/Makefile | 59 ++++ examples/01_communicators/README.md | 107 ++++++ .../01_ring_pattern/Makefile | 57 +++ .../01_ring_pattern/README.md | 149 ++++++++ .../02_point_to_point/01_ring_pattern/main.cc | 273 ++++++++++++++ examples/02_point_to_point/Makefile | 47 +++ examples/02_point_to_point/README.md | 65 ++++ examples/03_collectives/01_allreduce/Makefile | 57 +++ .../03_collectives/01_allreduce/README.md | 141 ++++++++ examples/03_collectives/01_allreduce/main.cc | 201 +++++++++++ examples/03_collectives/Makefile | 47 +++ examples/03_collectives/README.md | 68 ++++ .../01_allreduce/Makefile | 77 ++++ .../01_allreduce/README.md | 163 +++++++++ .../01_allreduce/main.cc | 214 +++++++++++ examples/04_user_buffer_registration/Makefile | 47 +++ .../04_user_buffer_registration/README.md | 73 ++++ .../05_symmetric_memory/01_allreduce/Makefile | 77 ++++ .../01_allreduce/README.md | 165 +++++++++ .../05_symmetric_memory/01_allreduce/main.cc | 220 ++++++++++++ examples/05_symmetric_memory/Makefile | 47 +++ examples/05_symmetric_memory/README.md | 72 ++++ examples/06_device_api/01_allreduce/Makefile | 81 +++++ examples/06_device_api/01_allreduce/README.md | 218 ++++++++++++ examples/06_device_api/01_allreduce/main.cu | 251 +++++++++++++ examples/06_device_api/Makefile | 47 +++ examples/06_device_api/README.md | 70 ++++ examples/Makefile | 54 +++ examples/README.md | 146 ++++++++ examples/common/README.md | 36 ++ examples/common/include/mpi_utils.h | 23 ++ examples/common/include/nccl_utils.h | 40 +++ examples/common/include/utils.h | 55 +++ examples/common/src/utils.cc | 334 ++++++++++++++++++ makefiles/examples.mk | 31 ++ 45 files changed, 5243 insertions(+), 1 deletion(-) create mode 100644 examples/01_communicators/01_multiple_devices_single_process/Makefile create mode 100644 examples/01_communicators/01_multiple_devices_single_process/README.md create mode 100644 examples/01_communicators/01_multiple_devices_single_process/main.cc create mode 100644 examples/01_communicators/02_one_device_per_pthread/Makefile create mode 100644 examples/01_communicators/02_one_device_per_pthread/README.md create mode 100644 examples/01_communicators/02_one_device_per_pthread/main.cc create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/Makefile create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/README.md create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/main.cc create mode 
100644 examples/01_communicators/Makefile create mode 100644 examples/01_communicators/README.md create mode 100644 examples/02_point_to_point/01_ring_pattern/Makefile create mode 100644 examples/02_point_to_point/01_ring_pattern/README.md create mode 100644 examples/02_point_to_point/01_ring_pattern/main.cc create mode 100644 examples/02_point_to_point/Makefile create mode 100644 examples/02_point_to_point/README.md create mode 100644 examples/03_collectives/01_allreduce/Makefile create mode 100644 examples/03_collectives/01_allreduce/README.md create mode 100644 examples/03_collectives/01_allreduce/main.cc create mode 100644 examples/03_collectives/Makefile create mode 100644 examples/03_collectives/README.md create mode 100644 examples/04_user_buffer_registration/01_allreduce/Makefile create mode 100644 examples/04_user_buffer_registration/01_allreduce/README.md create mode 100644 examples/04_user_buffer_registration/01_allreduce/main.cc create mode 100644 examples/04_user_buffer_registration/Makefile create mode 100644 examples/04_user_buffer_registration/README.md create mode 100644 examples/05_symmetric_memory/01_allreduce/Makefile create mode 100644 examples/05_symmetric_memory/01_allreduce/README.md create mode 100644 examples/05_symmetric_memory/01_allreduce/main.cc create mode 100644 examples/05_symmetric_memory/Makefile create mode 100644 examples/05_symmetric_memory/README.md create mode 100644 examples/06_device_api/01_allreduce/Makefile create mode 100644 examples/06_device_api/01_allreduce/README.md create mode 100644 examples/06_device_api/01_allreduce/main.cu create mode 100644 examples/06_device_api/Makefile create mode 100644 examples/06_device_api/README.md create mode 100644 examples/Makefile create mode 100644 examples/README.md create mode 100644 examples/common/README.md create mode 100644 examples/common/include/mpi_utils.h create mode 100644 examples/common/include/nccl_utils.h create mode 100644 examples/common/include/utils.h create mode 100644 examples/common/src/utils.cc create mode 100644 makefiles/examples.mk diff --git a/Makefile b/Makefile index 458a50741..2b1a57c5a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -11,6 +11,7 @@ BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} +examples.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) @@ -23,6 +24,9 @@ ${BUILDDIR}/%.txt: %.txt src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} +examples: src.build + ${MAKE} -C examples NCCL_HOME=${ABSBUILDDIR} + pkg.%: ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} diff --git a/examples/01_communicators/01_multiple_devices_single_process/Makefile b/examples/01_communicators/01_multiple_devices_single_process/Makefile new file mode 100644 index 000000000..edea518fc --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/Makefile @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = multiple_devices_single_process + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." + @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Multiple Devices Single Process" + @echo "==============================================" + @echo "" + @echo "This example shows how to use ncclCommInitAll to create" + @echo "communicators for multiple GPUs in a single process." + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/01_communicators/01_multiple_devices_single_process/README.md b/examples/01_communicators/01_multiple_devices_single_process/README.md new file mode 100644 index 000000000..9b4fe0f4a --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/README.md @@ -0,0 +1,177 @@ + + +# NCCL Example: Multiple Devices Single Process + +This example demonstrates how to use `ncclCommInitAll` to create NCCL +communicators for multiple GPUs within a single process, without requiring MPI +or threading. + +## Overview + +The `ncclCommInitAll` function provides a simplified way to initialize NCCL +communicators when: +- All GPUs are managed by a single process +- Running on a single node +- No multi-process coordination is needed + +This approach is ideal for single-node multi-GPU applications where simplicity +is preferred over the flexibility of multi-process setups. + +## What This Example Does + +1. **Device Detection**: + - Queries available CUDA devices + - Lists device properties for each GPU + +2. **Communicator Creation**: + - Uses `ncclCommInitAll` to create all communicators in one call + - Automatically assigns NCCL ranks 0 through n-1 + - No NCCL unique ID distribution needed + +3. **Verification**: + - Displays communicator information for each GPU + - Shows rank assignments and device mappings + - Confirms successful initialization + +4. **Cleanup**: + - Properly destroys communicators and streams + - Demonstrates correct resource management + +## Building and Running + +### Build +```shell +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with all available GPUs +```shell +./multiple_devices_single_process +``` + +### Run with specific GPUs +```shell +# Use only GPUs 0 and 1 +CUDA_VISIBLE_DEVICES=0,1 ./multiple_devices_single_process +``` + +### Run with NCCL debug output +```shell +NCCL_DEBUG=INFO ./multiple_devices_single_process +``` + +## Code Walk-through + +### Key Function: ncclCommInitAll +For single-node collective examples we use `ncclCommInitAll` as it creates a clique of communicators in one call. 
+```c
+int num_gpus;      // num_gpus is set by querying the CUDA devices
+ncclComm_t* comms;
+int* devices;      // devices needs to be populated with the CUDA devices used
+
+// Create communicators for all devices in one call
+NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices));
+```
+
+This single function call:
+- Creates `num_gpus` communicators
+- Assigns ranks 0 to (num_gpus-1)
+- Sets up internal communication paths
+- No unique ID needed
+
+### Comparison with ncclCommInitRank
+`ncclCommInitAll` is a convenience function and has the same functionality as:
+```c
+ncclUniqueId id;
+
+ncclGetUniqueId(&id);
+
+ncclGroupStart();
+for(int i = 0; i < num_gpus; i++) {
+  cudaSetDevice(devices[i]);
+  ncclCommInitRank(&comms[i], num_gpus, id, i);
+}
+ncclGroupEnd();
+```
+
+## Expected Output
+
+```
+Found 4 CUDA device(s) available
+
+Available GPU devices:
+  GPU 0: NVIDIA A100-SXM4-40GB (CUDA Device 0)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 1: NVIDIA A100-SXM4-40GB (CUDA Device 1)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 2: NVIDIA A100-SXM4-40GB (CUDA Device 2)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 3: NVIDIA A100-SXM4-40GB (CUDA Device 3)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+Using ncclCommInitAll() to create all communicators simultaneously
+All 4 NCCL communicators initialized successfully
+
+Communicator Details:
+  Communicator 0: Rank 0/4 on CUDA device 0
+  Communicator 1: Rank 1/4 on CUDA device 1
+  Communicator 2: Rank 2/4 on CUDA device 2
+  Communicator 3: Rank 3/4 on CUDA device 3
+All communicators have the expected size of 4
+
+Synchronizing all CUDA streams...
+All streams synchronized
+Destroying NCCL communicators...
+All NCCL communicators destroyed
+Destroying CUDA streams...
+All CUDA streams destroyed
+
+=============================================================
+SUCCESS: Multiple devices single process example completed!
+=============================================================
+```
+
+## When to Use ncclCommInitAll
+
+### Ideal Use Cases
+- **Single-node workloads**: All GPUs on one machine
+- **Simple applications**: No multi-process complexity needed
+- **Testing/Development**: Quick setup for experiments
+
+### When NOT to Use
+- **Multi-node clusters**: Need MPI for cross-node communication
+- **Process isolation**: When GPUs should be in separate processes
+
+## Performance Considerations
+
+- **Advantages**:
+  - Lower overhead (no inter-process communication)
+  - Simpler memory management
+  - Direct access to all GPUs
+
+- **Disadvantages**:
+  - Limited by single process resources
+  - Cannot scale beyond one node
+
+## Common Issues and Solutions
+
+1. **Not all GPUs visible**:
+   - Check `CUDA_VISIBLE_DEVICES`
+   - Ensure user has permissions for all GPUs
+   - Verify no other process is using GPUs exclusively
+
+2. **Out of memory**:
+   - Single process must handle memory for all GPUs
+   - Consider using multiple processes if memory is limited
+
+## Next Steps
+
+After understanding this example:
+1. Try the collective operation examples using `ncclCommInitAll`
+2. Compare performance with MPI-based multi-process approach
+3.
Experiment with different GPU combinations diff --git a/examples/01_communicators/01_multiple_devices_single_process/main.cc b/examples/01_communicators/01_multiple_devices_single_process/main.cc new file mode 100644 index 000000000..19b4edc7c --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/main.cc @@ -0,0 +1,256 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include + +/* + * NCCL Example: Multiple Devices Single Process + * ============================================= + * + * PURPOSE: + * This example demonstrates how to initialize NCCL communicators for multiple + * GPUs within a single process. This is the simplest NCCL setup and is ideal + * for learning NCCL basics or for applications that want to use multiple GPUs + * without the complexity of multi-process coordination. + * + * LEARNING OBJECTIVES: + * - Learn how to use ncclCommInitAll() for simple multi-GPU setups + * - See proper NCCL communicator lifecycle management + * - Understand GPU device management in NCCL applications + * - Learn proper resource cleanup patterns + * + * HOW IT WORKS: + * 1. Detect all available CUDA devices + * 2. Create communicators for all devices using ncclCommInitAll() + * 3. Verify communicator properties (rank, size, device assignment) + * 4. Clean up all resources properly + * + * KEY CONCEPTS: + * - ncclCommInitAll(): Creates multiple communicators in a single call + * - Single-process topology: All GPUs managed by one process + * - Device management: Setting active CUDA device for operations + * - Stream management: Each GPU gets its own CUDA stream + * + * WHEN TO USE THIS PATTERN: + * - Learning NCCL fundamentals + * - Single-node, multi-GPU applications + * - Applications that don't need multi-node scaling + * - Prototyping and testing NCCL functionality + * + * USAGE EXAMPLES: + * ./multiple_devices_single_process # Use all available GPUs + * + * EXPECTED OUTPUT: + * - Detection of all available GPUs + * - Successful communicator initialization + * - Display of rank/size information for each GPU + * - Clean resource cleanup confirmation + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// ============================================================================= +// MAIN FUNCTION - NCCL Communicator Lifecycle Example +// ============================================================================= + +int main(int argc, char *argv[]) { + // Variables for managing multiple GPU communicators + int num_gpus; // Number of available CUDA devices + ncclComm_t *comms = NULL; // Array of NCCL 
communicators (one per GPU) + cudaStream_t *streams = NULL; // Array of CUDA streams (one per GPU) + int *devices = NULL; // Array of device IDs to use + + // Discover how many CUDA devices are available + // This determines how many NCCL communicators we'll create + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + + if (num_gpus == 0) { + fprintf(stderr, "ERROR: No CUDA devices found on this system\n"); + fprintf( + stderr, + "Please ensure CUDA is properly installed and GPUs are available\n"); + return 1; + } + + printf("Found %d CUDA device(s) available\n\n", num_gpus); + + // ========================================================================= + // STEP 1: Prepare Device Information and Memory Allocation + // ========================================================================= + + // Allocate arrays to hold our per-device resources + // We need one communicator, stream, and device ID per GPU + devices = (int *)malloc(num_gpus * sizeof(int)); + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + + if (!devices || !comms || !streams) { + fprintf(stderr, "ERROR: Failed to allocate memory for device arrays\n"); + return 1; + } + + // Create device list and display device information + // By default, we use all available devices (0, 1, 2, ...) + printf("Available GPU devices:\n"); + for (int i = 0; i < num_gpus; i++) { + devices[i] = i; // Use device i for communicator i + + // Query device properties for informational display + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, devices[i])); + printf(" GPU %d: %s (CUDA Device %d)\n", i, prop.name, devices[i]); + printf(" Compute Capability: %d.%d\n", prop.major, prop.minor); + printf(" Memory: %.1f GB\n", + prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); + } + + // Create a CUDA stream for each GPU + // Each GPU needs its own stream for optimal performance + for (int i = 0; i < num_gpus; i++) { + // Set the active CUDA device before creating resources on it + // This ensures the stream is created on the correct GPU + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamCreate(&streams[i])); + } + + // ========================================================================= + // STEP 2 : Initialize NCCL Communicators + // ========================================================================= + + printf("Using ncclCommInitAll() to create all communicators " + "simultaneously\n"); + + // ncclCommInitAll() creates all communicators at once and handles the + // coordination internally + // + // Parameters: + // - comms: Array to store the created communicators + // - num_gpus: Number of communicators to create + // - devices: Array of CUDA device IDs to use + // + // After this call: + // - comms[0] will be the communicator for devices[0] with rank 0 + // - comms[1] will be the communicator for devices[1] with rank 1 + // - ... 
and so on + // + // All communicators will have the same 'size' (total number of + // participants) + NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices)); + printf("All %d NCCL communicators initialized successfully\n\n", num_gpus); + + // ========================================================================= + // STEP 3: Create CUDA Streams and Verify Communicator Properties + // ========================================================================= + + printf("Communicator Details:\n"); + + bool sizes_match = true; + for (int i = 0; i < num_gpus; i++) { + + // Query the communicator to verify it was set up correctly + // These calls validate that NCCL properly assigned ranks and devices + int rank, size, device; + // Get this communicator's rank + NCCLCHECK(ncclCommUserRank(comms[i], &rank)); + // Get total number of participants + NCCLCHECK(ncclCommCount(comms[i], &size)); + // Get assigned CUDA device + NCCLCHECK(ncclCommCuDevice(comms[i], &device)); + + printf(" Communicator %d: Rank %d/%d on CUDA device %d", i, rank, size, + device); + + // Verify the assignment is correct + if (rank != i) { + printf(" [WARNING: Expected rank %d]", i); + } + if (device != devices[i]) { + printf(" [WARNING: Expected device %d]", devices[i]); + } + printf("\n"); + + // Verify that all communicators have the expected size + if (size != num_gpus) { + printf("WARNING: Communicator %d has size %d, expected %d\n", i, size, num_gpus); + sizes_match = false; + } + } + if (sizes_match) + printf("All communicators have the expected size of %d\n", num_gpus); + + printf("\n"); + + // ========================================================================= + // STEP 4: Cleanup and Resource Management + // ========================================================================= + + // IMPORTANT: Proper cleanup is critical for NCCL applications + // Resources must be cleaned up in the correct order to avoid issues + + // First, synchronize all streams to ensure no operations are in flight + // This prevents destroying resources while they're still being used + printf("Synchronizing all CUDA streams...\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + printf("All streams synchronized\n"); + + // Next, destroy NCCL communicators first + // This must be done before destroying CUDA resources they depend on + printf("Destroying NCCL communicators...\n"); + for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclCommDestroy(comms[i])); + } + printf("All NCCL communicators destroyed\n"); + + // Finally, destroy CUDA streams + // This is safe now that the communicators are gone + printf("Destroying CUDA streams...\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + printf("All CUDA streams destroyed\n"); + + // Free host memory allocations + free(devices); + free(comms); + free(streams); + + printf("\n=============================================================\n"); + printf("SUCCESS: Multiple devices single process example completed!\n"); + printf("=============================================================\n\n"); + + return 0; +} diff --git a/examples/01_communicators/02_one_device_per_pthread/Makefile b/examples/01_communicators/02_one_device_per_pthread/Makefile new file mode 100644 index 000000000..f7f825a0b --- /dev/null +++ b/examples/01_communicators/02_one_device_per_pthread/Makefile @@ -0,0 +1,68 @@ +# +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Include common build rules
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = one_device_per_pthread
+
+# Add pthread support
+LDFLAGS += -lpthread
+
+# Source files
+SOURCES = main.cc
+OBJECTS = $(SOURCES:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable
+$(TARGET): $(OBJECTS)
+	$(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+	@echo "Built target $@"
+
+# Compile source files
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Test target
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+	@echo "Running with default thread count (number of GPUs)"
+	./$(TARGET)
+	@echo ""
+	@if [ "$$(nvidia-smi -L | wc -l)" -ge 2 ]; then \
+		echo "Running with 2 threads"; \
+		NTHREADS=2 ./$(TARGET); \
+	fi
+
+# Clean build artifacts
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help
+help:
+	@echo "NCCL Example: One Device per Thread (pthread)"
+	@echo "============================================"
+	@echo ""
+	@echo "This example shows how to use ncclCommInitRank to create"
+	@echo "communicators for multiple GPUs using pthreads."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all     - Build the example (default)"
+	@echo "  test    - Build and run basic tests"
+	@echo "  clean   - Remove build artifacts"
+	@echo "  install - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help    - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/01_communicators/02_one_device_per_pthread/README.md b/examples/01_communicators/02_one_device_per_pthread/README.md
new file mode 100644
index 000000000..170f235b5
--- /dev/null
+++ b/examples/01_communicators/02_one_device_per_pthread/README.md
@@ -0,0 +1,158 @@
+
+
+# NCCL Example: One Device per Thread (pthread)
+
+This example demonstrates NCCL communicator lifecycle management using pthreads, with one GPU per
+thread.
+
+## Overview
+
+This example shows how to use NCCL in a multi-threaded environment where each pthread manages one
+GPU device. It demonstrates the proper initialization and cleanup sequence for NCCL communicators
+within threads.
+
+## What This Example Does
+
+1. **Thread Creation**:
+   - Creates one pthread per available GPU, or `NTHREADS` threads if set
+   - Each thread manages its own CUDA device context
+
+2. **Communicator Creation**:
+   - Uses `ncclCommInitRank` with a unique ID shared across threads
+   - Each thread initializes its own communicator
+   - Demonstrates thread-safe NCCL initialization
+
+3. **Verification**:
+   - Queries communicator properties (rank, size, device)
+   - Confirms successful initialization across all threads
+
+4. **Cleanup**:
+   - Proper resource cleanup order within each thread
+   - Demonstrates correct NCCL and CUDA resource management
+
+## Building and Running
+
+### Build
+```shell
+make [NCCL_HOME=] [CUDA_HOME=]
+```
+
+### Run with specific thread count (number of GPUs)
+```shell
+[NTHREADS=n] ./one_device_per_pthread
+```
+
+### Run with NCCL debug output
+```shell
+NCCL_DEBUG=INFO ./one_device_per_pthread
+```
+
+## Code Walk-through
+
+### Key Function: ncclCommInitRank in threads
+```c
+// Each thread creates its own copy of struct `threadData_t`.
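+// Note: main() copies the ncclUniqueId returned by ncclGetUniqueId() by value
+// into each thread's threadData_t before pthread_create, so every thread
+// passes the same ID to ncclCommInitRank without any extra synchronization.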
+typedef struct {
+  int thread_id;        // thread_id is set when the thread is created
+  int num_gpus;         // num_gpus is set by querying the CUDA devices
+  ncclUniqueId commId;  // commId is set by ncclGetUniqueId
+  ncclComm_t* comms;
+} threadData_t;
+threadData_t* data;
+
+// Each thread initializes its own communicator
+NCCLCHECK(ncclCommInitRank(&data->comms[thread_id], data->num_gpus, data->commId, data->thread_id));
+```
+
+In this approach:
+- Each thread gets its own NCCL rank (0, 1, 2...)
+- No explicit distribution of the unique ID is needed: all threads share the process address space and receive the ID through their `threadData_t`.
+
+## Expected Output
+
+```
+Using 4 devices with pthreads
+Creating 4 threads for NCCL communicators
+  Thread 0: Set device 0 and created stream
+  Thread 1: Set device 1 and created stream
+  Thread 2: Set device 2 and created stream
+  Thread 3: Set device 3 and created stream
+  Thread 0: NCCL communicator initialized
+  Thread 1: NCCL communicator initialized
+  Thread 2: NCCL communicator initialized
+  Thread 3: NCCL communicator initialized
+All threads synchronized - communicators ready
+  Thread 0: Communicator rank 0 of 4
+  Thread 1: Communicator rank 1 of 4
+  Thread 2: Communicator rank 2 of 4
+  Thread 3: Communicator rank 3 of 4
+  Thread 0: Destroyed NCCL communicator
+  Thread 1: Destroyed NCCL communicator
+  Thread 2: Destroyed NCCL communicator
+  Thread 3: Destroyed NCCL communicator
+  Thread 0: Resources cleaned up
+  Thread 1: Resources cleaned up
+  Thread 2: Resources cleaned up
+  Thread 3: Resources cleaned up
+All threads completed
+Success
+```
+
+## When to Use pthread Approach
+
+### Ideal Use Cases
+- **Thread-based applications**: When your application is already threaded
+- **Single-node workloads**: All GPUs on one machine
+- **Shared memory**: Need to share data structures between GPU contexts
+
+### When NOT to Use
+- **Multi-node clusters**: Cannot scale beyond one node
+- **Process isolation**: When GPU contexts should be isolated
+- **Complex applications**: Multi-process approach may be cleaner
+
+## Performance Considerations
+
+- **Advantages**:
+  - Shared address space between threads
+  - Easier data sharing between GPU contexts
+  - No MPI overhead
+
+- **Disadvantages**:
+  - Thread synchronization complexity
+  - Limited to single node
+
+## Common Issues and Solutions
+
+1. **Thread synchronization errors**:
+   - Ensure all threads use the same NCCL unique ID
+   - Use proper pthread synchronization (barriers, joins)
+
+2. **CUDA context conflicts**:
+   - Each thread must call `cudaSetDevice()` before CUDA operations
+   - Don't share CUDA streams between threads
+
+3. **Resource cleanup order**:
+   - Always destroy NCCL communicators before CUDA resources
+   - Synchronize streams before destroying communicators
+
+## Error Handling
+
+The example uses simplified error handling with CHECK macros:
+- **CUDACHECK**: Exits immediately on CUDA errors
+- **NCCLCHECK**: Exits immediately on NCCL errors
+- **No async error checking**: Simplified for clarity
+- **Thread safety**: Each thread handles its own errors
+
+## Highlighted Environment Variables
+
+- `NTHREADS`: Number of threads to create (defaults to the number of GPUs)
+
+See examples/README.md for the full list.
+
+## Next Steps
+
+After understanding this example:
+1. Try using the collective examples and add the pthread approach
+2.
Compare with MPI-based multi-process approach diff --git a/examples/01_communicators/02_one_device_per_pthread/main.cc b/examples/01_communicators/02_one_device_per_pthread/main.cc new file mode 100644 index 000000000..9dc6f7c8b --- /dev/null +++ b/examples/01_communicators/02_one_device_per_pthread/main.cc @@ -0,0 +1,196 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include + +/** + * NCCL Pthread Example - One Device Per Thread (Simple Version) + * + * This example demonstrates the basic lifecycle of NCCL communicators in a + * multi-threaded environment. Each pthread manages one GPU device and shows + * how to properly create and destroy NCCL communicators. + * + * Key Learning Points: + * - NCCL communicator creation and destruction within threads + * - CUDA stream management per thread + * - Proper resource cleanup order + * + * This is a minimal example focusing purely on communicator lifecycle + * management without performing actual collective operations. + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Thread data structure to pass parameters +typedef struct { + int thread_id; + int num_gpus; + ncclUniqueId commId; + ncclComm_t *comms; +} threadData_t; + +void *thread_worker(void *arg) { + threadData_t *data = (threadData_t *)arg; + int thread_id = data->thread_id; + cudaStream_t stream; + + // ========================================================================= + // Set Device Context and Create Stream + // ========================================================================= + // Each thread must set its device context before any CUDA operations + CUDACHECK(cudaSetDevice(thread_id)); + CUDACHECK(cudaStreamCreate(&stream)); + + printf(" Thread %d: Set device %d and created stream\n", thread_id, + thread_id); + + // ========================================================================= + // Initialize NCCL Communicator + // ========================================================================= + // Each thread creates its own communicator using the shared unique ID + NCCLCHECK(ncclCommInitRank(&data->comms[thread_id], data->num_gpus, data->commId, + thread_id)); + + printf(" Thread %d: NCCL communicator initialized\n", thread_id); + + if (thread_id == 0) { + printf("All threads initialized - communicators ready\n"); + } + + // ========================================================================= + // Query Communicator Properties + // ========================================================================= + // Verify the communicator was created correctly + int comm_thread_id, comm_size; + 
NCCLCHECK(ncclCommUserRank(data->comms[thread_id], &comm_thread_id)); + NCCLCHECK(ncclCommCount(data->comms[thread_id], &comm_size)); + + printf(" Thread %d: Communicator thread_id %d of %d\n", thread_id, + comm_thread_id, comm_size); + + // Synchronize CUDA stream to ensure all GPU work is complete + if (stream != NULL) { + CUDACHECK(cudaStreamSynchronize(stream)); + } + + // ========================================================================= + // Cleanup Resources (Proper Order) + // ========================================================================= + // Destroy NCCL communicator FIRST (before CUDA resources) + // This is important - NCCL cleanup should happen before CUDA cleanup + if (data->comms[thread_id] != NULL) { + NCCLCHECK(ncclCommDestroy(data->comms[thread_id])); + printf(" Thread %d: Destroyed NCCL communicator\n", comm_thread_id); + } + + // Now destroy CUDA stream + if (stream != NULL) { + CUDACHECK(cudaStreamDestroy(stream)); + } + + printf(" Thread %d: Resources cleaned up\n", thread_id); + + return NULL; +} + +int main(int argc, char *argv[]) { + int num_gpus; + pthread_t *threads; + threadData_t *threadData; + ncclComm_t *comms; + ncclUniqueId commId; + + // ========================================================================= + // STEP 1: Initialize Variables and Check GPU Availability + // ========================================================================= + + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + const char *nThreadsEnv = getenv("NTHREADS"); + if (nThreadsEnv) { + num_gpus = atoi(nThreadsEnv); + } + + if (num_gpus < 1) { + printf("No CUDA devices found\n"); + return EXIT_FAILURE; + } + + printf("Using %d devices with pthreads\n", num_gpus); + + // ========================================================================= + // STEP 2: Allocate Memory and Prepare Data Structures + // ========================================================================= + + threads = (pthread_t *)malloc(num_gpus * sizeof(pthread_t)); + threadData = (threadData_t *)malloc(num_gpus * sizeof(threadData_t)); + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + + // Generate unique ID for NCCL communicator initialization + NCCLCHECK(ncclGetUniqueId(&commId)); + + // ========================================================================= + // STEP 3: Create and Launch Pthread Threads + // ========================================================================= + + printf("Creating %d threads for NCCL communicators\n", num_gpus); + + for (int i = 0; i < num_gpus; i++) { + threadData[i].thread_id = i; + threadData[i].num_gpus = num_gpus; + threadData[i].commId = commId; + threadData[i].comms = comms; + + pthread_create(&threads[i], NULL, thread_worker, &threadData[i]); + } + + // ========================================================================= + // STEP 4: Wait for Thread Completion + // ========================================================================= + + for (int i = 0; i < num_gpus; i++) { + pthread_join(threads[i], NULL); + } + + printf("All threads completed\n"); + + // ========================================================================= + // STEP 5: Cleanup Resources + // ========================================================================= + + free(threads); + free(threadData); + free(comms); + + printf("Success\n"); + return 0; +} diff --git a/examples/01_communicators/03_one_device_per_process_mpi/Makefile b/examples/01_communicators/03_one_device_per_process_mpi/Makefile new file mode 100644 index 
000000000..d12e2fc27
--- /dev/null
+++ b/examples/01_communicators/03_one_device_per_process_mpi/Makefile
@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# This example needs to be built with MPI support
+MPI = 1
+
+# Include common build rules
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = one_device_per_process_mpi
+
+# Source files
+SOURCES = main.cc
+OBJECTS = $(SOURCES:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable
+$(TARGET): $(OBJECTS)
+	$(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+	@echo "Built target $@"
+
+# Compile source files
+%.o: %.cc
+	$(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Test target
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+	@echo "Running with 1 process"
+	$(MPIRUN) -np 1 ./$(TARGET)
+	@echo ""
+	@echo "Running with 2 processes"
+	$(MPIRUN) -np 2 ./$(TARGET)
+
+# Clean build artifacts
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help
+help:
+	@echo "NCCL Example: One Device per Process (MPI)"
+	@echo "=========================================="
+	@echo ""
+	@echo "This example shows how to use ncclCommInitRank to create"
+	@echo "communicators with one GPU per MPI process."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all     - Build the example (default)"
+	@echo "  test    - Build and run tests with different process counts"
+	@echo "  clean   - Remove build artifacts"
+	@echo "  install - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help    - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/01_communicators/03_one_device_per_process_mpi/README.md b/examples/01_communicators/03_one_device_per_process_mpi/README.md
new file mode 100644
index 000000000..5d3d2b40d
--- /dev/null
+++ b/examples/01_communicators/03_one_device_per_process_mpi/README.md
@@ -0,0 +1,196 @@
+
+
+# NCCL Example: One Device per Process (MPI)
+
+This example demonstrates NCCL communicator lifecycle management using MPI, with
+one GPU per MPI process.
+
+## Overview
+
+This example shows one of the most common NCCL deployment patterns: one GPU
+device per process. This approach is ideal for distributed training across
+multiple nodes and provides the foundation for scalable multi-GPU applications.
+MPI is used because it provides a parallel launcher and broadcast functions. It is,
+however, not a requirement for multi-node NCCL applications.
+
+Other approaches use server-client models or spawn parallel processes using
+sockets. NCCL only requires that the unique ID is distributed to each
+thread/process taking part in collective communication and that all threads/processes
+call an NCCL initialization function.
+
+## What This Example Does
+
+1. **Multi-node Support**:
+   - Determines local rank on each node automatically
+   - Maps MPI processes to GPUs on each node
+   - Uses `MPI_Comm_split_type` with `MPI_COMM_TYPE_SHARED` to assign each local
+     rank a GPU.
+
+2. **Communicator Creation**:
+   - Uses `ncclCommInitRank` with an MPI-coordinated unique ID
+   - Rank `0` generates and broadcasts the NCCL unique ID
+   - Each process joins the distributed communicator
+
+3. **Verification**:
+   - Displays MPI rank → NCCL rank → GPU device mapping
+   - Confirms successful initialization across all processes
+
+4. **Cleanup**:
+   - Proper resource cleanup order
+   - MPI synchronization for clean shutdown
+
+## Building and Running
+
+### Build
+```shell
+make MPI=1 [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=]
+```
+
+### Run example
+```shell
+mpirun -np ./one_device_per_process_mpi
+```
+
+### Run with NCCL debug output
+```shell
+NCCL_DEBUG=INFO mpirun -np ./one_device_per_process_mpi
+```
+
+## Code Walk-through
+
+This approach:
+- Automatically handles multi-node GPU assignment
+- Uses MPI for coordination and NCCL for GPU communication
+- Supports both single-node and multi-node deployments
+
+### Unique ID Distribution
+The NCCL unique ID must be shared with all processes that call `ncclCommInitRank`. We use MPI for that:
+```c
+// Rank 0 generates unique ID
+if (mpi_rank == 0) {
+  NCCLCHECK(ncclGetUniqueId(&nccl_id));
+}
+
+// Broadcast to all processes
+MPI_Bcast(&nccl_id, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);
+```
+
+### Key Function: Multi-node GPU assignment
+```c
+// Separate function to determine the node-local rank via `MPI_Comm_split_type`
+int local_rank = getLocalRank(MPI_COMM_WORLD);
+
+// Use the local rank as the GPU device number. This assumes you start at most
+// as many processes per node as there are available GPUs.
+CUDACHECK(cudaSetDevice(local_rank));
+
+ncclComm_t comm;
+int mpi_rank, mpi_size;  // mpi_rank & mpi_size are set during MPI initialization
+ncclUniqueId nccl_id;    // nccl_id is generated and broadcast as above
+
+// Initialize NCCL communicator across all processes
+NCCLCHECK(ncclCommInitRank(&comm, mpi_size, nccl_id, mpi_rank));
+```
+
+## Expected Output
+
+### Single Node (4 processes)
+```
+Starting NCCL communicator lifecycle example with 4 processes
+  MPI initialized - Process 0 of 4 total processes
+  Found 4 CUDA devices on this node
+  MPI rank 0 assigned to CUDA device 0
+Rank 0 generated NCCL unique ID for all processes
+  Rank 0 received NCCL unique ID
+  Rank 0 created NCCL communicator
+  MPI rank 0 → NCCL rank 0/4 on GPU device 0
+
+[Similar output for ranks 1-3]
+
+All communicators initialized successfully! Beginning cleanup...
+  Rank 0 destroyed NCCL communicator
+
+All NCCL communicators created and cleaned up properly!
+This example demonstrated the complete NCCL communicator lifecycle.
+Next steps: Try running NCCL collective operations (AllReduce, etc.)
+```
+
+### Multi-node (8 processes, 2 nodes)
+```
+Starting NCCL communicator lifecycle example with 8 processes
+  MPI initialized - Process 0 of 8 total processes
+  MPI initialized - Process 1 of 8 total processes
+  MPI initialized - Process 2 of 8 total processes
+  MPI initialized - Process 3 of 8 total processes
+  MPI initialized - Process 4 of 8 total processes
+  MPI initialized - Process 5 of 8 total processes
+  MPI initialized - Process 6 of 8 total processes
+  MPI initialized - Process 7 of 8 total processes
+...
+
+  MPI rank 0 → NCCL rank 0/8 on GPU device 0
+  MPI rank 1 → NCCL rank 1/8 on GPU device 1
+  MPI rank 2 → NCCL rank 2/8 on GPU device 2
+  MPI rank 3 → NCCL rank 3/8 on GPU device 3
+  MPI rank 4 → NCCL rank 4/8 on GPU device 0
+  MPI rank 5 → NCCL rank 5/8 on GPU device 1
+  MPI rank 6 → NCCL rank 6/8 on GPU device 2
+  MPI rank 7 → NCCL rank 7/8 on GPU device 3
+
+All NCCL communicators created and cleaned up properly!
+``` + +## When to Use MPI Approach + +### Ideal Use Cases +- **Multi-node clusters**: Scales across multiple machines +- **Production deployments**: Industry standard for distributed training, + inference, and most HPC codes +- **Process isolation**: Each GPU in separate process for robustness +- **Large scale**: Supports thousands of GPUs + +### When NOT to Use +- **Single-node testing**: Simpler approaches available +- **No MPI available**: Some environments don't support MPI +- **Shared memory needs**: Single-process approaches may be simpler + +## Performance Considerations + +- **Advantages**: + - MPI has been optimized for large parallel startup. + - Industry standard deployment pattern + - Optimal for large-scale training + +- **Disadvantages**: + - MPI setup complexity + - Inter-process communication overhead + - Requires MPI runtime environment + +## Common Issues and Solutions + +1. **More MPI processes than GPUs on a node**: + - The example reports an error if local rank exceeds available devices + - Use fewer processes per node or more GPUs + +2. **MPI broadcast hangs**: + - Ensure all ranks participate in collective operations + - Check MPI installation and network connectivity + +3. **Multi-node communication fails**: + - Check firewall settings and network configuration + - Set `NCCL_SOCKET_IFNAME` to specify network interface + +## Error Handling + +The example uses simplified error handling with CHECK macros: +- **CUDACHECK**: Exits immediately on CUDA errors +- **NCCLCHECK**: Exits immediately on NCCL errors +- **No async error checking**: Simplified for clarity +- **No global error coordination**: Each process exits on its own errors + +## Next Steps + +After understanding this example: +1. Try running collective operations (AllReduce, AllGather, etc.) +2. Experiment with multi-node deployments diff --git a/examples/01_communicators/03_one_device_per_process_mpi/main.cc b/examples/01_communicators/03_one_device_per_process_mpi/main.cc new file mode 100644 index 000000000..44a2377bf --- /dev/null +++ b/examples/01_communicators/03_one_device_per_process_mpi/main.cc @@ -0,0 +1,249 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "mpi.h" +#include "nccl.h" +#include +#include +#include +#include + +/** + * NCCL Example: One Device per Process with MPI + * ============================================= + * + * LEARNING OBJECTIVE: + * This example teaches the fundamental NCCL pattern: one GPU device per MPI + * process. This is the most common deployment pattern for multi-GPU distributed + * training. + * + * WHAT THIS CODE DEMONSTRATES: + * - How to initialize NCCL communicators across multiple processes + * - Proper GPU assignment in both single-node and multi-node environments + * - Complete NCCL communicator lifecycle management + * - Error handling best practices for production code + * + * STEP-BY-STEP PROCESS: + * 1. MPI Setup: Initialize MPI and determine process layout + * 2. GPU Assignment: Map each process to a local GPU device + * 3. NCCL ID Sharing: Rank 0 creates unique ID, broadcasts to all processes + * 4. Communicator Creation: Each process joins the NCCL communicator + * 5. Verification: Query and verify communicator properties + * 6. 
Clean Shutdown: Properly destroy all resources in correct order + * + * MULTI-NODE INTELLIGENCE: + * - Automatically detects which processes are on the same physical node + * - Assigns local GPU indices (0, 1, 2, 3...) to processes on each node + * - Uses MPI_Comm_split_type with MPI_COMM_TYPE_SHARED for robust node + * identification + * - Leverages MPI's native shared memory detection for optimal performance + * + * USAGE EXAMPLES: + * Single node (4 GPUs): mpirun -np 4 ./one_device_per_process_mpi + * + * EXPECTED OUTPUT: + * Each process will report: MPI rank → NCCL rank → GPU device assignment + * Success message confirms all communicators were created properly + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// ============================================================================= +// LOCAL RANK UTILITY FUNCTION - For Multi-Node GPU Assignment +// ============================================================================= + +/** + * Determine the local rank of this process on its physical node + * + * Algorithm: + * 1. Split the communicator based on shared memory (i.e., nodes) + * 2. Get the rank within the node communicator + * 3. This rank becomes the local rank for GPU assignment + * + * @param comm The MPI communicator to use for determining local rank + * @return Local rank (0, 1, 2...) 
for GPU assignment, or -1 on error + */ +int getLocalRank(MPI_Comm comm) { + + int world_size; + MPI_Comm_size(comm, &world_size); + + int world_rank; + MPI_Comm_rank(comm, &world_rank); + + // Split the communicator based on shared memory (i.e., nodes) + MPI_Comm node_comm; + MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, + &node_comm); + + // Get the rank and size within the node communicator + int node_rank, node_size; + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + + // Clean up the node communicator + MPI_Comm_free(&node_comm); + + return node_rank; +} + +// ============================================================================= +// MAIN FUNCTION - NCCL Communicator Lifecycle Example +// ============================================================================= + +int main(int argc, char *argv[]) { + // Variables for MPI, CUDA, and NCCL components + int mpi_rank, mpi_size, local_rank; + int num_gpus = 0; + ncclComm_t comm = NULL; + cudaStream_t stream = NULL; + ncclUniqueId nccl_id; + + // ========================================================================= + // STEP 1: Initialize MPI and determine process layout + // ========================================================================= + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + + + if (mpi_rank == 0) { + printf("Starting NCCL communicator lifecycle example with %d processes\n", + mpi_size); + } + // Determine which local GPU this process should use + local_rank = getLocalRank(MPI_COMM_WORLD); + + printf(" MPI initialized - Process %d of %d total processes\n", mpi_rank, + mpi_size); + + // ========================================================================= + // STEP 2: Setup CUDA device for this process + // ========================================================================= + + // Check how many CUDA devices are available on this node + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + printf(" Found %d CUDA devices on this node\n", num_gpus); + + if (num_gpus == 0) { + fprintf(stderr, "ERROR: No CUDA devices found on this node!\n"); + exit(EXIT_FAILURE); + } + + if (local_rank >= num_gpus) { + fprintf(stderr, + "ERROR: Process %d needs GPU %d but only %d devices available\n", + mpi_rank, local_rank, num_gpus); + exit(EXIT_FAILURE); + } + + // Assign this process to its designated GPU device + CUDACHECK(cudaSetDevice(local_rank)); + + // Create CUDA stream for GPU operations + CUDACHECK(cudaStreamCreate(&stream)); + + printf(" MPI rank %d assigned to CUDA device %d\n", mpi_rank, + local_rank); + + // ========================================================================= + // STEP 3: Initialize NCCL communicator + // ========================================================================= + + // Generate NCCL unique ID (only rank 0 needs to do this) + if (mpi_rank == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_id)); + printf("Rank 0 generated NCCL unique ID for all processes\n"); + } + + // Share the unique ID with all processes using MPI broadcast + MPI_Bcast(&nccl_id, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); + printf("INFO: Rank %d received NCCL unique ID\n", mpi_rank); + + // Create NCCL communicator for this process + // This is where each process joins the distributed NCCL communicator + NCCLCHECK(ncclCommInitRank(&comm, mpi_size, nccl_id, mpi_rank)); + printf(" Rank %d created NCCL communicator\n", mpi_rank); + + // 
========================================================================= + // STEP 4: Verify communicator setup + // ========================================================================= + + // Query communicator properties to verify everything is set up correctly + int comm_rank, comm_size, comm_device; + NCCLCHECK(ncclCommUserRank(comm, &comm_rank)); + NCCLCHECK(ncclCommCount(comm, &comm_size)); + NCCLCHECK(ncclCommCuDevice(comm, &comm_device)); + + printf(" MPI rank %d → NCCL rank %d/%d on GPU device %d\n", mpi_rank, + comm_rank, comm_size, comm_device); + + // Give all processes a chance to finish their printf + MPI_Barrier(MPI_COMM_WORLD); + + // ========================================================================= + // STEP 5: Clean shutdown and resource cleanup + // ========================================================================= + + if (mpi_rank == 0) { + printf( + "\nAll communicators initialized successfully! Beginning cleanup...\n"); + } + + // Synchronize CUDA stream to ensure all GPU work is complete + if (stream != NULL) { + CUDACHECK(cudaStreamSynchronize(stream)); + } + + // Destroy NCCL communicator FIRST (before CUDA resources) + // This is important - NCCL cleanup should happen before CUDA cleanup + if (comm != NULL) { + NCCLCHECK(ncclCommDestroy(comm)); + printf(" Rank %d destroyed NCCL communicator\n", mpi_rank); + } + + // Now destroy CUDA stream + if (stream != NULL) { + CUDACHECK(cudaStreamDestroy(stream)); + } + + if (mpi_rank == 0) { + printf( + "\nAll NCCL communicators created and cleaned up properly!\n"); + printf("This example demonstrated the complete NCCL communicator " + "lifecycle.\n"); + printf("Next steps: Try running NCCL collective operations (AllReduce, " + "etc.)\n"); + } + + MPI_Finalize(); + return 0; +} diff --git a/examples/01_communicators/Makefile b/examples/01_communicators/Makefile new file mode 100644 index 000000000..1b89e5904 --- /dev/null +++ b/examples/01_communicators/Makefile @@ -0,0 +1,59 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../makefiles/common.mk + +# NCCL Fundamental Examples +EXAMPLES = 01_multiple_devices_single_process 02_one_device_per_pthread + +ifeq ($(MPI), 1) +EXAMPLES += 03_one_device_per_process_mpi +endif + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done +ifneq ($(MPI),1) + $(MAKE) -C 03_one_device_per_process_mpi clean +endif + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test || exit 1; \ + done + +# Help +help: + @echo "NCCL Communicator Init Examples" + @echo "===============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_multiple_devices_single_process - Create communicators using multiple GPUs in a single thread" + @echo " 02_one_device_per_pthread - Create communicators using one GPU per thread" + @echo " 03_one_device_per_process_mpi - Create communicators using one GPU per MPI process" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_multiple_devices_single_process" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/01_communicators/README.md b/examples/01_communicators/README.md new file mode 100644 index 000000000..1c2218c9e --- /dev/null +++ b/examples/01_communicators/README.md @@ -0,0 +1,107 @@ + + +# NCCL Communicator Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL communicator +lifecycle management (creation, query, and destruction) using different +initialization patterns. 
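+
+As a quick orientation, the sketch below shows the lifecycle every example in
+this directory follows — create the communicators, query them, destroy them —
+using the simplest pattern (`ncclCommInitAll`). It is a minimal illustration
+only: it assumes the `NCCLCHECK`/`CUDACHECK` macros defined in the example
+sources and skips stream and buffer setup; see the individual examples for
+complete, runnable code.
+
+```c
+// Create one communicator per visible GPU in a single call
+int num_gpus = 0;
+CUDACHECK(cudaGetDeviceCount(&num_gpus));
+ncclComm_t *comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t));
+NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL));
+
+// Query each communicator to confirm its rank, size, and device
+for (int i = 0; i < num_gpus; i++) {
+  int rank, size, device;
+  NCCLCHECK(ncclCommUserRank(comms[i], &rank));
+  NCCLCHECK(ncclCommCount(comms[i], &size));
+  NCCLCHECK(ncclCommCuDevice(comms[i], &device));
+}
+
+// Destroy the communicators before freeing the host-side array
+for (int i = 0; i < num_gpus; i++) {
+  NCCLCHECK(ncclCommDestroy(comms[i]));
+}
+free(comms);
+```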
+ +## Examples + +### [01_multiple_devices_single_process](01_multiple_devices_single_process/) +**Multiple Devices Single Process** +- **Pattern**: Single process manages all GPUs +- **API**: `ncclCommInitAll` (no external coordination) +- **Use case**: Simple single-node applications +- **Key features**: + - Simplest initialization method + - No MPI or threading required + - Automatic rank assignment (0 to n-1) + - Cannot span multiple nodes + +**Run command:** +```shell +./01_multiple_devices_single_process/multiple_devices_single_process +``` + +### [02_one_device_per_pthread](02_one_device_per_pthread/) +**One Device per Thread with pthreads** +- **Pattern**: One thread per GPU within single process +- **API**: `ncclCommInitRank` with pthread coordination +- **Use case**: Single-node multi-GPU, thread-based parallelism +- **Key features**: + - pthread barriers for synchronization + - Shared memory for unique ID + - Lower overhead than multi-process + - Cannot span multiple nodes + +**Run command:** +```shell +[NTHREADS=n] ./02_one_device_per_pthread/one_device_per_pthread +``` + +### [03_one_device_per_process_mpi](03_one_device_per_process_mpi/) +**One Device per Process with MPI** +- **Pattern**: One MPI process per GPU +- **API**: `ncclCommInitRank` with MPI coordination +- **Use case**: Multi-node clusters, distributed training +- **Key features**: + - MPI broadcast for unique ID distribution + - Process-to-GPU mapping by local MPI ranks + - Scalable to multiple nodes + +**Run command:** +```shell +mpirun -np ./03_one_device_per_process_mpi/one_device_per_process_mpi +``` + +## Choosing the Right Approach + +| Feature | ncclCommInitAll | pthread | MPI | +|------------------------|-----------------|------------------|----------| +| **Multi-node support** | ✗ | ✗ | ✓ | +| **Process isolation** | ✗ | ✗ | ✓ | +| **Setup complexity** | Low | Medium | High | +| **Memory overhead** | Low | Medium | High | +| **Best for** | Simple test | Single-node apps | Clusters | + +### When to use each: +- **ncclCommInitAll**: Development, testing, simple single-node apps +- **pthread**: Single-node with thread-based parallelism needs +- **MPI**: Production distributed training, multi-node setups + +## Building + +### **Quick Start** +```shell +# Build all examples [or single directory] +make [directory] + +# Test all examples +make test +``` + +### **Individual Examples** +```shell +# Build specific example +make 01_multiple_devices_single_process +make 02_one_device_per_pthread +make 03_one_device_per_process_mpi + +# Test individual example +cd 01_multiple_devices_single_process && make test +cd 02_one_device_per_pthread && make test +cd 03_one_device_per_process_mpi && make test +``` + +## References +- [NVIDIA NCCL User Guide + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) +- [MPI Standard](https://www.mpi-forum.org/docs/) diff --git a/examples/02_point_to_point/01_ring_pattern/Makefile b/examples/02_point_to_point/01_ring_pattern/Makefile new file mode 100644 index 000000000..52b51dbfa --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/Makefile @@ -0,0 +1,57 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = ring_pattern + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." + @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: P2P Ring Pattern" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/02_point_to_point/01_ring_pattern/README.md b/examples/02_point_to_point/01_ring_pattern/README.md new file mode 100644 index 000000000..7ed046e59 --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/README.md @@ -0,0 +1,149 @@ + + +# NCCL Ring Communication Pattern Example + +This example demonstrates a ring communication pattern using NCCL P2P +operations. It runs on a single node where a single process manages all GPUs and +data flows in a circular pattern. + +## Overview + +The ring communication pattern creates a circular data flow where each GPU sends +data to its "next" neighbor and receives from its "previous" neighbor in the +ring. This example uses `ncclCommInitAll` for simplified single-threaded, +single-process multi-GPU setup. + +## What This Example Does + +1. **Detects and initializes all available GPUs** using `ncclCommInitAll` for + simplified single-process setup +2. **Creates ring topology** where each GPU calculates its next and previous + neighbors using modulo +3. **Executes simultaneous point-to-point communication** with each GPU sending + to next and receiving from previous +4. 
**Verifies data correctness** by checking that each GPU received the expected + data from its predecessor + +## Building and Running + +### Build the Example +```bash +cd examples/02_point_to_point/01_ring_pattern +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with All Available GPUs +```bash +./ring_pattern +``` + +## Code Walk-through + +### Ring Topology Setup + +The example calculates ring neighbors using modulo arithmetic: + +```cpp +for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; // Next neighbor in ring + int prev = (i - 1 + num_gpus) % num_gpus; // Previous neighbor in ring +} +``` + +### Simultaneous Communication + +Uses `ncclGroupStart/End` to prevent deadlocks when scheduling all send and +receive operations: + +```cpp +float **d_sendbuff; // device side send and receive buffer are allocated through cudaMalloc +float **d_recvbuff; +size_t count; // count is set to the number of floats to be sent (usually the size of the buffers) +ncclComm_t *comms; // comms are set during ncclCommInitAll +cudaStream_t *streams; // streams are set in cudaStreamCreate + +// Each GPU simultaneously sends to next and receives from previous +NCCLCHECK(ncclGroupStart()); +for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; + int prev = (i - 1 + num_gpus) % num_gpus; + + NCCLCHECK(ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i])); + NCCLCHECK(ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i])); +} +NCCLCHECK(ncclGroupEnd()); +``` + +## Expected Output + +``` +Starting NCCL ring communication example +Using 4 GPUs for ring communication +Preparing data structures +Initializing NCCL communicators +All communicators initialized successfully +Creating CUDA streams and verifying setup + GPU 0 -> NCCL rank 0/4 on CUDA device 0 + GPU 1 -> NCCL rank 1/4 on CUDA device 1 + GPU 2 -> NCCL rank 2/4 on CUDA device 2 + GPU 3 -> NCCL rank 3/4 on CUDA device 3 +Setting up ring topology +Data flow -> GPU 0 -> GPU 1 -> ... -> GPU 3 -> GPU 0 +Ring transfer with 268435456 elements (1.00 GB per GPU) +Allocating and initializing buffers +Executing ring communication + GPU 0 sends to GPU 1, receives from GPU 3 + GPU 1 sends to GPU 2, receives from GPU 0 + GPU 2 sends to GPU 3, receives from GPU 1 + GPU 3 sends to GPU 0, receives from GPU 2 +Ring communication completed successfully +Verifying data correctness + GPU 0 received data from GPU 3: CORRECT + GPU 1 received data from GPU 0: CORRECT + GPU 2 received data from GPU 1: CORRECT + GPU 3 received data from GPU 2: CORRECT +SUCCESS - All GPUs received correct data +Cleaning up resources +Example completed successfully! +``` + +## When to Use + +- **Learning NCCL fundamentals**: Understanding point-to-point communication + patterns +- **Algorithm development**: Building custom collective operations based on + point to point communications +- **Single-node applications**: Pipeline parallelism or custom data distribution + patterns + +## Key Insights +- `ncclCommInitAll` simplifies single-node multi-GPU setup +- No MPI or pthreads needed for single-node patterns +- Ring pattern enables circular data flow among all GPUs +- `ncclGroupStart/End` prevents deadlock in simultaneous operations +- Each GPU both sends and receives in parallel + +## Common Issues and Solutions + +### Issue: Deadlock without group operations +**Solution:** Always use `ncclGroupStart()` and `ncclGroupEnd()` when performing +simultaneous send/recv operations. 
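+
+For contrast, a sketch of the ungrouped version (reusing the variables from the
+walk-through above) shows why grouping matters in a single-process, multi-GPU
+setup: the first blocking call can wait forever because this single thread
+never gets to issue the matching receive on the peer's communicator.
+
+```cpp
+// Ungrouped (do NOT do this when one thread drives several communicators):
+for (int i = 0; i < num_gpus; i++) {
+  int next = (i + 1) % num_gpus;
+  int prev = (i - 1 + num_gpus) % num_gpus;
+  // May block before the matching ncclRecv on comms[next] is ever posted
+  NCCLCHECK(ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i]));
+  NCCLCHECK(ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i]));
+}
+// Wrapping the same loop in ncclGroupStart()/ncclGroupEnd() defers execution
+// until all sends and receives are declared, which avoids the deadlock.
+```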
+ +### Issue: Verification failures +**Solution:** Check ring topology calculations and data initialization patterns. +Ensure correct neighbor calculations. + +## Error Handling + +This example uses comprehensive error checking with `NCCLCHECK` and `CUDACHECK` +macros that immediately exit on any failure. In production code, consider more +graceful error handling and recovery mechanisms. + +## Next Steps + +After this example, try: +- **Collective operations**: Examples in `03_collectives/` +- **Multi-node approach**: Use the MPI implementation from `01_communicators` to + send data across nodes. diff --git a/examples/02_point_to_point/01_ring_pattern/main.cc b/examples/02_point_to_point/01_ring_pattern/main.cc new file mode 100644 index 000000000..2f317b9b5 --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/main.cc @@ -0,0 +1,273 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include +#include + +/* + * NCCL Ring Pattern Example - Educational Version + * + * This example demonstrates the fundamental ring communication pattern using + * NCCL's point-to-point operations. Understanding ring patterns is essential + * for NCCL programming as they form the basis of many collective algorithms. + * + * Learning Objectives: + * - Understand ring topology and neighbor communication + * - Learn NCCL point-to-point send/recv operations + * - See how data flows in a ring pattern + * - Practice deadlock avoidance with ncclGroup operations + * - Understand single-process multi-GPU patterns + * + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char *argv[]) { + // ======================================================================== + // STEP 1: Initialize Environment and Detect GPUs + // ======================================================================== + + int num_gpus = 0; + ncclComm_t *comms = NULL; + cudaStream_t *streams = NULL; + float **h_sendbuff = NULL; + float **h_recvbuff = NULL; + float **d_sendbuff = NULL; + float **d_recvbuff = NULL; + + printf("Starting NCCL ring communication example\n"); + + // Get number of available CUDA devices + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + + if (num_gpus == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return 1; + } + + if (num_gpus < 2) { + printf("At least 2 GPU are necessary to create inter-GPU traffic\n"); + printf("Found only %d GPU(s) - pattern will be limited\n", num_gpus); + } + + printf("Using %d GPUs for ring communication\n", num_gpus); + + // ======================================================================== + // STEP 2: Prepare 
Data Structures and Device List + // ======================================================================== + + printf("Preparing data structures\n"); + + // Create device list (use all available devices) + int *devices = (int *)malloc(num_gpus * sizeof(int)); + for (int i = 0; i < num_gpus; i++) { + devices[i] = i; + } + + // Allocate communicators, streams, and buffer pointers + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + h_sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + h_recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + d_sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + d_recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + + // ======================================================================== + // STEP 3: Initialize NCCL Communicators + // ======================================================================== + + /* + * ncclCommInitAll is the simplest way to initialize NCCL communicators + * for single-process, multi-GPU scenarios. It automatically: + * - Creates one communicator per GPU + * - Assigns ranks sequentially (GPU 0 = rank 0, GPU 1 = rank 1, etc.) + */ + printf("Initializing NCCL communicators\n"); + NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices)); + printf("All communicators initialized successfully\n"); + + // ======================================================================== + // STEP 4: Create Streams and Verify Communicator Setup + // ======================================================================== + + printf("Creating CUDA streams and verifying setup\n"); + + // Create streams and verify communicator info + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamCreate(&streams[i])); + + // Query communicator information for verification + int rank, size, device; + NCCLCHECK(ncclCommUserRank(comms[i], &rank)); + NCCLCHECK(ncclCommCount(comms[i], &size)); + NCCLCHECK(ncclCommCuDevice(comms[i], &device)); + + printf(" GPU %d -> NCCL rank %d/%d on CUDA device %d\n", i, rank, size, + device); + } + + // ======================================================================== + // STEP 5: Set Up Ring Topology and Allocate Buffers + // ======================================================================== + + printf("Setting up ring topology\n"); + printf("Data flow -> GPU 0 -> ... 
-> GPU %d -> GPU 0\n", num_gpus - 1); + + // Test with 1GB of data + const size_t count = 256 * 1024 * 1024; // 256M floats = 1GB + const size_t size_bytes = count * sizeof(float); + + printf("Ring transfer with %zu elements (%.2f GB per GPU)\n", count, + size_bytes / (1024.0 * 1024.0 * 1024.0)); + + // Allocate buffers for each GPU + printf("Allocating and initializing buffers\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + + h_sendbuff[i] = (float *)malloc(size_bytes); + h_recvbuff[i] = (float *)malloc(size_bytes); + CUDACHECK(cudaMalloc((void **)&d_sendbuff[i], size_bytes)); + CUDACHECK(cudaMalloc((void **)&d_recvbuff[i], size_bytes)); + + // Initialize data with GPU-specific pattern for verification + for (size_t j = 0; j < count; j++) { + h_sendbuff[i][j] = (float)(i * 1000 + j % 1000); + } + CUDACHECK(cudaMemcpy(d_sendbuff[i], h_sendbuff[i], size_bytes, + cudaMemcpyHostToDevice)); + } + + // ======================================================================== + // STEP 6: Execute Ring Communication Pattern + // ======================================================================== + + /* + * The ring communication uses ncclGroup operations to avoid deadlock. + * Without grouping, if all GPUs tried to send first, they would deadlock + * waiting for receivers. Grouping allows NCCL to execute operations + * in the optimal order. + */ + printf("Executing ring communication\n"); + + // NOTE: ncclGroupStart and ncclGroupEnd are essential to avoid deadlock + // when using ncclCommInitAll! + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; + int prev = (i - 1 + num_gpus) % num_gpus; + printf(" GPU %d sends to GPU %d, receives from GPU %d\n", i, next, prev); + + // Each GPU simultaneously sends to next and receives from previous + NCCLCHECK( + ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i])); + NCCLCHECK( + ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i])); + } + NCCLCHECK(ncclGroupEnd()); + + // Synchronize all streams to ensure communication completes + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + + printf("Ring communication completed successfully\n"); + + // ======================================================================== + // STEP 7: Verify Data Correctness and Report Results + // ======================================================================== + + printf("Verifying data correctness\n"); + bool all_correct = true; + + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaMemcpy(h_recvbuff[i], d_recvbuff[i], size_bytes, + cudaMemcpyDeviceToHost)); + + int prev = (i - 1 + num_gpus) % num_gpus; + // Verify that GPU i received data from GPU prev + float expected = (float)(prev * 1000); + bool correct = (h_recvbuff[i][0] == expected); + + printf(" GPU %d received data from GPU %d: %s\n", i, prev, + correct ? 
"CORRECT" : "ERROR"); + + if (!correct) { + all_correct = false; + printf(" Expected %.0f, got %.0f\n", expected, h_recvbuff[i][0]); + } + } + + if (all_correct) { + printf("SUCCESS - All GPUs received correct data\n"); + } else { + printf("FAILURE - Data verification failed\n"); + } + + // ======================================================================== + // STEP 8: Cleanup Resources + // ======================================================================== + + printf("Cleaning up resources\n"); + + // Free buffers + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + free(h_sendbuff[i]); + free(h_recvbuff[i]); + CUDACHECK(cudaFree(d_sendbuff[i])); + CUDACHECK(cudaFree(d_recvbuff[i])); + } + + // Destroy communicators and streams + for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclCommDestroy(comms[i])); + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + + // Free allocated memory + free(devices); + free(comms); + free(streams); + free(h_sendbuff); + free(h_recvbuff); + free(d_sendbuff); + free(d_recvbuff); + + printf("Example completed successfully!\n"); + + return 0; +} diff --git a/examples/02_point_to_point/Makefile b/examples/02_point_to_point/Makefile new file mode 100644 index 000000000..0310c0471 --- /dev/null +++ b/examples/02_point_to_point/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Fundamental Examples +EXAMPLES = 01_ring_pattern + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test || exit 1; \ + done + +# Help +help: + @echo "NCCL Point to Point Examples" + @echo "============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_ring_pattern - Use send and receive operations to form a ring pattern" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_ring_pattern" + +.PHONY: all clean test $(EXAMPLES) diff --git a/examples/02_point_to_point/README.md b/examples/02_point_to_point/README.md new file mode 100644 index 000000000..26f1cc555 --- /dev/null +++ b/examples/02_point_to_point/README.md @@ -0,0 +1,65 @@ + + +# NCCL Point-to-Point Communication Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL point-to-point +(P2P) communication patterns on a single node. These examples focus on clarity +and correct communicator usage, resource management, and verification. 
+ +## Examples + +### [01_ring_pattern](01_ring_pattern/) +**Ring Communication Pattern** +- **Pattern**: Circular data flow among all GPUs +- **API**: `ncclCommInitAll` with P2P operations (`ncclSend`/`ncclRecv`) +- **Use case**: Learning P2P communication; pipeline/data movement patterns on a + single node +- **Key features**: + - Initializes all GPUs in a single process + - Computes ring neighbors with modulo arithmetic + - Uses `ncclGroupStart/End` to prevent deadlocks + - Verifies data correctness after transfers + +## Choosing the Right Pattern + +*Scenario* : Pipeline parallel training needs to send data from one GPU to +another +*Addresses* : Individual transfers between two ranks +*Dependencies* : A functional NCCL library and its dependencies + +### Why `ncclCommInitAll` here? +For single-node collective examples we use `ncclCommInitAll` as it creates a +clique of communicators in one call. + +```c +// Initialize all GPUs in one call +ncclComm_t* comms; +int num_gpus; +NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_ring_pattern +``` + +### **Individual Examples** +```shell +# Build and run the ring pattern +cd 01_ring_pattern && make +./ring_pattern +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/03_collectives/01_allreduce/Makefile b/examples/03_collectives/01_allreduce/Makefile new file mode 100644 index 000000000..9972b91cd --- /dev/null +++ b/examples/03_collectives/01_allreduce/Makefile @@ -0,0 +1,57 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." 
+ @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/03_collectives/01_allreduce/README.md b/examples/03_collectives/01_allreduce/README.md new file mode 100644 index 000000000..42bf43aad --- /dev/null +++ b/examples/03_collectives/01_allreduce/README.md @@ -0,0 +1,141 @@ + + +# NCCL AllReduce Collective Operation Example + +This example demonstrates the fundamental AllReduce collective operation using +NCCL's single-process, multi-GPU approach in which a single process manages all +GPUs to perform a sum reduction. + +## Overview + +AllReduce combines data from all participants using a reduction operation (sum, +max, min, etc.) and distributes the result to all participants. This example +shows how each GPU contributes its rank value and all GPUs receive the combined +sum using `ncclCommInitAll` for simplified setup. + +## What This Example Does + +1. **Detects available GPUs** and initializes NCCL communicators for all devices + using `ncclCommInitAll` +2. **Initializes data** with each GPU contributing its rank value (GPU 0→0, GPU + 1→1, etc.) +3. **Performs AllReduce sum operation** where all GPU values are summed and + distributed to all participants +4. **Verifies correctness** by checking that all GPUs received the expected sum: + 0+1+2+...+(n-1) + +## Building and Running + +### Build the Example +```bash +cd examples/03_collectives/01_allreduce +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with All Available GPUs +```bash +./allreduce +``` + +### Run with Specific GPUs +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 ./allreduce +``` + +## Code Walk-through + +### Data Initialization +Each GPU sets a send buffer allocated on the GPU to its rank value: +```cpp +float** sendbuff; +float rank_value = (float)i; +size_t size; // size is the number of float to be sent + +// Allocate device memory for send buffers +CUDACHECK(cudaMalloc((void **)&sendbuff[i], size * sizeof(float))); + +// Each GPU contributes its rank (GPU i contributes value i) +// Zero the entire buffer, then set first element to rank +CUDACHECK(cudaMemset(sendbuff[i], 0, size * sizeof(float))); +CUDACHECK(cudaMemcpy(sendbuff[i], &rank_value, sizeof(float), cudaMemcpyHostToDevice)); +``` + +### AllReduce Operation +All GPUs participate in the sum reduction. The operations are evaluated in parallel within a NCCL group to avoid any deadlocks. 
+```cpp +float** recvbuff; +ncclComm_t *comms; // comms are set during ncclCommInitAll +cudaStream_t *streams; // streams are set in cudaStreamCreate + +// Allocate device memory for receive buffers +CUDACHECK(cudaMalloc((void **)&recvbuff[i], size * sizeof(float))); + +NCCLCHECK(ncclGroupStart()); +for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], size, ncclFloat, + ncclSum, comms[i], streams[i])); +} +NCCLCHECK(ncclGroupEnd()); +``` + +## Expected Output + +``` +Using 4 devices for collective communication +Memory allocated for 4 communicators and streams +NCCL communicators initialized for all devices + Device 0 initialized with data value 0 + Device 1 initialized with data value 1 + Device 2 initialized with data value 2 + Device 3 initialized with data value 3 +Starting collective sum operation across all devices +Collective operation completed +Verifying results (expected sum: 6) + Device 0 correctly received sum: 6 + Device 1 correctly received sum: 6 + Device 2 correctly received sum: 6 + Device 3 correctly received sum: 6 +Example completed successfully! +``` + +## When to Use + +- **Deep learning**: Gradient averaging in data-parallel training +- **Scientific computing**: Global reductions in parallel algorithms +- **Statistics**: Computing global sums, averages, or other reductions +- **Distributed algorithms**: Any scenario requiring collective reduction + operations + +## Key Insights +- `ncclCommInitAll` simplifies single-node multi-GPU setup +- No MPI or pthreads needed for single-node patterns +- Allocate device buffer via ``cudaMalloc` and initialize via `cudaMemset`. +- Best practices to wrap all collective calls in ncclGroupStart/End +- All communication happens in parallel + +## Common Issues and Solutions + +### Issue: Verification failures +**Solution:** Ensure each GPU initializes its buffer correctly with its rank +value. + +### Issue: Out of memory errors +**Solution:** Reduce the buffer size in the code or use fewer GPUs. + +## Error Handling + +This example uses comprehensive error checking with `NCCLCHECK` and `CUDACHECK` +macros that immediately exit on any failure. In production code, consider more +graceful error handling and recovery mechanisms. + +## Next Steps + +After understanding AllReduce, explore: +- **Point-to-point communication**: Examples in `02_point_to_point/` +- **Other collectives**: Implement Broadcast, Reduce, AllGather operations using + this example +- **Multi-node approach**: Use the MPI implementation from `01_communicators` to + send data across nodes. + diff --git a/examples/03_collectives/01_allreduce/main.cc b/examples/03_collectives/01_allreduce/main.cc new file mode 100644 index 000000000..08fef3d20 --- /dev/null +++ b/examples/03_collectives/01_allreduce/main.cc @@ -0,0 +1,201 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include +#include + +/* + * NCCL AllReduce Example - Collective Communication + * + * This example demonstrates the fundamental AllReduce collective operation + * using NCCL's single-process, multi-GPU approach. AllReduce is one of the most + * important collective operations in distributed and parallel computing. 
+ * + * Learning Objectives: + * - Understand AllReduce collective communication pattern + * - Learn NCCL single-process multi-GPU programming model + * - See how data reduction works across multiple devices + * - Practice verification and validation of collective results + * + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char *argv[]) { + // ======================================================================== + // STEP 1: Initialize Variables and Detect Available GPUs + // ======================================================================== + + int num_gpus = 0; + ncclComm_t *comms; + cudaStream_t *streams; + float **sendbuff; + float **recvbuff; + + // Get number of CUDA devices + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + if (num_gpus < 1) { + printf("No CUDA devices found\n"); + return EXIT_FAILURE; + } + + printf("Using %d devices for collective communication\n", num_gpus); + + // ======================================================================== + // STEP 2: Allocate Memory for Communicators, Streams, and Data Buffers + // ======================================================================== + + // Allocate arrays for per-device resources, and array of pointers for buffers + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + + printf("Memory allocated for %d communicators and streams\n", num_gpus); + + // ======================================================================== + // STEP 3: Initialize NCCL Communicators for All Devices + // ======================================================================== + + // ncclCommInitAll creates communicators for all devices in one call + // This is the simplest way to set up NCCL for single-process applications + NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); + printf("NCCL communicators initialized for all devices\n"); + + // ======================================================================== + // STEP 4: Create CUDA Streams and Allocate Device Memory + // ======================================================================== + + const size_t size = 32 * 1024 * 1024; // 32M floats for demonstration + + for (int i = 0; i < num_gpus; i++) { + // Set device context for each GPU + CUDACHECK(cudaSetDevice(i)); + + // Create stream for asynchronous operations + CUDACHECK(cudaStreamCreate(&streams[i])); + + // Allocate device memory for send and receive buffers + CUDACHECK(cudaMalloc((void **)&sendbuff[i], size * sizeof(float))); + CUDACHECK(cudaMalloc((void **)&recvbuff[i], size * sizeof(float))); + + // Initialize send buffer: zero the entire buffer, then set first element to + // rank + CUDACHECK(cudaMemset(sendbuff[i], 
0, size * sizeof(float))); + float rank_value = (float)i; + CUDACHECK(cudaMemcpy(sendbuff[i], &rank_value, sizeof(float), + cudaMemcpyHostToDevice)); + + printf(" Device %d initialized with data value %d\n", i, i); + } + + // ======================================================================== + // STEP 5: Perform AllReduce Sum Operation + // ======================================================================== + + printf("Starting collective sum operation across all devices\n"); + + // NOTE: ncclGroupStart and ncclGroupEnd are essential to avoid + // deadlock when using ncclCommInitAll and multiple communication calls. + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < num_gpus; i++) { + // Each device performs combines all contributions and distributes result + NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], size, ncclFloat, ncclSum, + comms[i], streams[i])); + } + NCCLCHECK(ncclGroupEnd()); + + // Synchronize all streams to ensure completion + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + + printf("Collective operation completed\n"); + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + // Expected result: sum of all ranks = 0 + 1 + 2 + ... + (num_gpus-1) + // Note: We only check the first element since that's all we initialized + float expected = (float)(num_gpus * (num_gpus - 1) / 2); + printf("Verifying results (expected sum: %.0f)\n", expected); + + bool success = true; + for (int i = 0; i < num_gpus; i++) { + float result; + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaMemcpy(&result, recvbuff[i], sizeof(float), + cudaMemcpyDeviceToHost)); + + if (result != expected) { + printf(" Device %d received incorrect result: %.0f (expected %.0f)\n", i, + result, expected); + success = false; + } else { + printf(" Device %d correctly received sum: %.0f\n", i, result); + } + } + + // ======================================================================== + // STEP 7: Cleanup Resources and Report Results + // ======================================================================== + + // Destroy NCCL communicators + for (int i = 0; i < num_gpus; i++) { + ncclCommDestroy(comms[i]); + } + + // Free device memory and destroy streams + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaFree(sendbuff[i])); + CUDACHECK(cudaFree(recvbuff[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + + // Free host memory + free(comms); + free(streams); + free(sendbuff); + free(recvbuff); + + if (success) { + printf("Example completed successfully!\n"); + } else { + printf("Example failed - incorrect results detected\n"); + return EXIT_FAILURE; + } + + return 0; +} diff --git a/examples/03_collectives/Makefile b/examples/03_collectives/Makefile new file mode 100644 index 000000000..c72163cba --- /dev/null +++ b/examples/03_collectives/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# NCCL Collective Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Collective Communication Examples" + @echo "=====================================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/03_collectives/README.md b/examples/03_collectives/README.md new file mode 100644 index 000000000..468202380 --- /dev/null +++ b/examples/03_collectives/README.md @@ -0,0 +1,68 @@ + + +# NCCL Collective Communication Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL collective +communication operations on a single node using a single process managing all +GPUs. The focus is clarity, correct resource management, and result +verification. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce Collective Operation** +- **Pattern**: All participants reduce and distribute the result +- **API**: `ncclCommInitAll`, `ncclAllReduce` +- **Use case**: Global reductions in ML and HPC (e.g., gradient averaging) +- **Key features**: + - Initializes all GPUs in a single process + - Each GPU contributes its rank value + - Executes AllReduce sum across all GPUs + - Verifies the expected global sum + +## Choosing the Right Pattern + +*Scenario* : Parallel training needs efficient global communication +*Addresses* : Most commonly used collective algorithms +*Dependencies* : A functional NCCL library and its dependencies + +### Why `ncclCommInitAll` here? +For single-node collective examples we use `ncclCommInitAll` as it creates a +clique of communicators in one call. + +```c +// Initialize all GPUs in one call +ncclComm_t* comms; +int num_gpus; +NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); +``` + +A more advanced setup using MPI to initialize communicators across multiple +nodes is shown in +[01_communicators/03_one_device_per_process_mpi](../01_communicators/03_one_device_per_process_mpi) + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce +cd 01_allreduce && make +./allreduce +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/04_user_buffer_registration/01_allreduce/Makefile b/examples/04_user_buffer_registration/01_allreduce/Makefile new file mode 100644 index 000000000..014d17583 --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/Makefile @@ -0,0 +1,77 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_ub + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cc $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: User Buffer Registration Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/04_user_buffer_registration/01_allreduce/README.md b/examples/04_user_buffer_registration/01_allreduce/README.md new file mode 100644 index 000000000..01e4b3467 --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/README.md @@ -0,0 +1,163 @@ + + +# NCCL User Buffer Registration AllReduce Example + +This example demonstrates how to use NCCL's user buffer registration feature to +optimize performance for repeated collective operations on the same buffers. +User Buffer Registration is a feature that allows NCCL to directly +send/receive/operate data through the user buffer without extra internal copy +(zero-copy). + +## Overview + +User buffer registration allows NCCL to pre-register memory buffers with +communicators, eliminating registration overhead on each operation. This is +particularly beneficial for applications that repeatedly perform collective +operations on the same memory regions, such as iterative training loops. + +## What This Example Does + +1. **Allocates memory using NCCL allocator** (`ncclMemAlloc`) which is provided + by NCCL as convenience function +2. **Registers buffers with communicator** using `ncclCommRegister` for + optimized performance +3. **Performs AllReduce sum operation** using the registered buffers for + efficient communication + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to +provide a valid MPI installation under `MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_ub +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_ub +``` + +## Code Structure + +### Key Components + +1. 
**Buffer Allocation and Registration**: +```c +size_t size_bytes; // Is set to the size of the send/receive buffers +void *d_sendbuff; +void *d_recvbuff; + +// Allocate buffers using ncclMemAlloc (or another qualified allocator) on the device +NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); +NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + +ncclComm_t comm; // comms is set during ncclCommInitRank +void *send_handle; +void *recv_handle; + +// Register buffers with NCCL, handle is returned for De-registration +NCCLCHECK(ncclCommRegister(comm, d_sendbuff, size_bytes, &send_handle)); +NCCLCHECK(ncclCommRegister(comm, d_recvbuff, size_bytes, &recv_handle)); +``` + +2. **AllReduce with Group Operations**: +```c +size_t count; // set to number of floats to exchange +cudaStream_t stream; // stream is set in cudaStreamCreate + +NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); +``` + +3. **Buffer Deregistration and Cleanup**: +```c +// Deregister buffers using handle from ncclCommRegister +NCCLCHECK(ncclCommDeregister(comm, send_handle)); +NCCLCHECK(ncclCommDeregister(comm, recv_handle)); + +// Free buffers allocated with ncclMemAlloc +NCCLCHECK(ncclMemFree(d_sendbuff)); +NCCLCHECK(ncclMemFree(d_recvbuff)); + +``` + +## Expected Output + +### With 4 GPUs (using pthreads/MPI) +``` +Starting AllReduce example with 4 ranks + Rank 0 communicator initialized using device 0 + Rank 1 communicator initialized using device 1 + Rank 2 communicator initialized using device 2 + Rank 3 communicator initialized using device 3 +User Buffer allocation: + Rank 0 allocating 4.00 MB per buffer + Rank 1 allocating 4.00 MB per buffer + Rank 2 allocating 4.00 MB per buffer + Rank 3 allocating 4.00 MB per buffer + Rank 0 data initialized (value: 0) + Rank 1 data initialized (value: 1) + Rank 2 data initialized (value: 2) + Rank 3 data initialized (value: 3) +Starting AllReduce with 1048576 elements (4 MB) +AllReduce completed successfully +Verification - Expected: 6.0, Got: 6.0 +Results verified correctly + Rank 0 buffers deregistered + Rank 1 buffers deregistered + Rank 2 buffers deregistered + Rank 3 buffers deregistered +All resources cleaned up successfully +``` + +## Performance Benefits of User Buffer Registration + +User buffer registration provides several performance advantages: + +1. **Reduced Overhead**: Pre-registration eliminates the need to + register/deregister buffers for each operation +2. **Better Memory Pinning**: Registered buffers are pinned in memory, + preventing page faults +3. **Lower Latency**: Especially beneficial for repeated operations on the same + buffers + +**Important**: Buffers must be allocated with `ncclMemAlloc` or a compatible +allocator for registration to work. See The [General Buffer Registration +](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#general-buffer-registration) +section of the user guide. + +**Important**: If any rank in a communicator passes registered buffers to a NCCL +communication function, all other ranks in the same communicator must pass their +registered buffers; otherwise, mixing registered and non-registered buffers can +result in undefined behavior. + +## Key Insights + +- **User Buffer Registration** is most beneficial for: + - Large data transfers + - Repeated operations on the same buffers + - Performance-critical applications +- **Memory management** is critical - always deregister buffers before freeing + +## Common Issues and Solutions + +1. 
**Registration Failure**: Buffers MUST be allocated with `ncclMemAlloc` or + another qualified allocator (not `cudaMalloc`) for registration. See [Buffer + Registration](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html) + section for details. +2. **Allocation Error**: If `ncclMemAlloc` fails, check NCCL version (requires + 2.19.x+) and available memory +3. **Deregistration Order**: Always deregister before freeing memory +4. **Handle Management**: Keep track of registration handles for proper cleanup +5. **Memory Leaks**: Always use `ncclMemFree` for buffers allocated with + `ncclMemAlloc` diff --git a/examples/04_user_buffer_registration/01_allreduce/main.cc b/examples/04_user_buffer_registration/01_allreduce/main.cc new file mode 100644 index 000000000..105b4e68e --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/main.cc @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +/* + * NCCL User Buffer Registration AllReduce Example + * + * This example demonstrates how to use NCCL's user buffer registration feature + * to optimize performance for repeated collective operations on the same + * buffers. + * + * Learning Objectives: + * - Learn how to register and deregister buffers with NCCL communicators + * - See the proper lifecycle management of registered buffers + * + */ + +/* + * This function can be called inside an MPI rank or pthread thread. The + * initialization and broadcast are implemented in common/src/utils.cc for + * easier readability. For fully integrated examples using pthreads or MPI see + * examples in 01_communicators. + */ +void *allReduce(int my_rank, int total_ranks, int local_device, + int devices_per_rank) { + + // ======================================================================== + // STEP 1: Initialize NCCL Communicator and Setup + // ======================================================================== + + ncclUniqueId nccl_unique_id; + if (my_rank == 0) { + printf("Starting AllReduce example with %d ranks\n", total_ranks); + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID. 
+ // This step ensures all ranks have the same unique ID for communicator + // creation + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + // Each rank manages its assigned GPU device + CUDACHECK(cudaSetDevice(local_device)); + + // Initialize NCCL communicator + // This creates the communication context for collective operations + ncclComm_t comm; + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d communicator initialized using device %d\n", my_rank, + local_device); + + // ======================================================================== + // STEP 2: Allocate Memory Using NCCL Allocator + // ======================================================================== + + if (my_rank == 0) { + printf("User Buffer allocation:\n"); + } + // Allocate memory - using larger buffers to demonstrate registration + // benefits + size_t count = 1024 * 1024; // 1M elements + size_t size_bytes = count * sizeof(float); + + printf(" Rank %d allocating %.2f MB per buffer\n", my_rank, + (float)size_bytes / (1024 * 1024)); + + // Allocate buffers using NCCL allocator + // NCCL's allocator can provide optimized memory for communication + void *d_sendbuff; + void *d_recvbuff; + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ======================================================================== + // STEP 3: Register Buffers with NCCL Communicator + // ======================================================================== + + // Register the buffers with NCCL + // This is the key optimization - buffers are pre-registered for efficiency + // The handles returned can be used to identify registered buffers + void *send_handle; + void *recv_handle; + NCCLCHECK(ncclCommRegister(comm, d_sendbuff, size_bytes, &send_handle)); + NCCLCHECK(ncclCommRegister(comm, d_recvbuff, size_bytes, &recv_handle)); + + // ======================================================================== + // STEP 4: Initialize Data and Prepare for Communication + // ======================================================================== + + // Initialize data - each rank contributes its rank value + // This creates a simple test pattern for verification + float *h_data = (float *)malloc(size_bytes); + for (size_t i = 0; i < count; i++) { + h_data[i] = (float)my_rank; + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_data, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d data initialized (value: %d)\n", my_rank, my_rank); + + // Create stream for asynchronous operations + // Streams allow overlapping computation and communication + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // ======================================================================== + // STEP 5: Perform AllReduce Operation + // ======================================================================== + + if (my_rank == 0) { + printf("Starting AllReduce with %zu elements (%zu MB)\n", count, + size_bytes / (1024 * 1024)); + } + + // Perform AllReduce operation + // Since buffers are registered, this should have optimized performance + NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); + + if (my_rank == 0) { + printf("AllReduce completed successfully\n"); + } + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + 
// Synchronize to ensure completion + CUDACHECK(cudaStreamSynchronize(stream)); + + // Verify results (optional - copy back and check a few elements) + float *h_result = (float *)malloc(sizeof(float) * count); + CUDACHECK(cudaMemcpy(h_result, d_recvbuff, sizeof(float) * count, + cudaMemcpyDeviceToHost)); + + // Each element should be the sum of all ranks + float expected_sum = (float)(total_ranks * (total_ranks - 1)) / 2; + bool all_ok = true; + if (my_rank == 0) { + printf("Verification - Expected: %.1f, Got: %.1f\n", expected_sum, + h_result[0]); + + for (size_t i = 1; i < count; i++) { + if (fabsf(h_result[i] - expected_sum) > 0.001) { + printf(" Results verification failed at index %zu: Expected %.1f, Got " + "%.1f\n", + i, expected_sum, h_result[i]); + all_ok = false; + break; + } + } + + if (all_ok) { + printf("Results verified correctly\n"); + } else { + printf("Results verification failed\n"); + } + } + + // ======================================================================== + // STEP 7: Cleanup and Resource Management + // ======================================================================== + + // Important: Cleanup must happen in the correct order + // 1. Free host memory + // 2. Deregister buffers from communicator + // 3. Free device memory + // 4. Destroy CUDA resources + // 5. Finalize and destroy NCCL communicator + + free(h_data); + free(h_result); + + // Deregister buffers from communicator + // This must happen before freeing the buffers or destroying the + // communicator + NCCLCHECK(ncclCommDeregister(comm, send_handle)); + NCCLCHECK(ncclCommDeregister(comm, recv_handle)); + printf(" Rank %d buffers deregistered\n", my_rank); + + // Free device memory allocated by NCCL + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Destroy CUDA stream + CUDACHECK(cudaStreamDestroy(stream)); + + // Finalize and destroy NCCL communicator + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + if (my_rank == 0) { + printf("All resources cleaned up successfully\n"); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + // Run example using the standard test framework + // This handles MPI/pthread initialization, device assignment, and cleanup + return run_example(argc, argv, allReduce); +} diff --git a/examples/04_user_buffer_registration/Makefile b/examples/04_user_buffer_registration/Makefile new file mode 100644 index 000000000..900074ab4 --- /dev/null +++ b/examples/04_user_buffer_registration/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# NCCL User Buffer Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL User Buffer Registration Examples" + @echo "======================================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/04_user_buffer_registration/README.md b/examples/04_user_buffer_registration/README.md new file mode 100644 index 000000000..f848cf140 --- /dev/null +++ b/examples/04_user_buffer_registration/README.md @@ -0,0 +1,73 @@ + + +# NCCL User Buffer Registration Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL user buffer +registration for improving performance by allowing NCCL to operate directly on +user-allocated buffers. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with User Buffer Registration** +- **Pattern**: Register communication buffers once and reuse across operations +- **API**: `ncclCommRegister`, `ncclCommDeregister`, `ncclMemAlloc`, + `ncclAllReduce` +- **Use case**: Repeated collectives on the same buffers; performance-critical + workloads +- **Key features**: + - Buffers allocated via `ncclMemAlloc` for registration compatibility + - Registration handles managed explicitly (register → use → deregister) + - Collective operations executed on registered buffers + - Correct cleanup and verification + +## Choosing the Right Pattern + +*Scenario* : Optimize performance for repeated collectives on same buffers +*Addresses* : Throughput-sensitive training loops +*Dependencies* : pthread or MPI + +### Why Buffer Registration? +Pre-registering buffers eliminates per-call registration overhead and enables +direct access. It can accelerate collectives and greatly reduce the resource +usage (e.g. #channel usage). Also, this is a prerequisite for advanced features +such as symmetric memory or device API calls. 
+ +```c +// Allocate using NCCL convenience function and register buffers +NCCLCHECK(ncclMemAlloc((void**)&d_send, size_bytes)); +NCCLCHECK(ncclCommRegister(comm, d_send, size_bytes, &send_handle)); + +// Use in collectives +NCCLCHECK(ncclAllReduce(d_send, d_recv, count, ncclFloat, ncclSum, comm, stream)); + +// Deregister and free +NCCLCHECK(ncclCommDeregister(comm, send_handle)); +NCCLCHECK(ncclMemFree(d_send)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce with user buffer registration +cd 01_allreduce && make +./allreduce_ub +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/05_symmetric_memory/01_allreduce/Makefile b/examples/05_symmetric_memory/01_allreduce/Makefile new file mode 100644 index 000000000..20c8ad9ad --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/Makefile @@ -0,0 +1,77 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_sm + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cc $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Symmetric Memeory Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/05_symmetric_memory/01_allreduce/README.md b/examples/05_symmetric_memory/01_allreduce/README.md new file mode 100644 index 000000000..b307b462a --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/README.md @@ -0,0 +1,165 @@ + + +# NCCL Symmetric Memory AllReduce Example + +This example demonstrates how to use NCCL's symmetric memory feature for +optimized collective operations. 
Symmetric memory provides optimized performance +by leveraging consistent memory layouts across all participating ranks, enabling +advanced communication algorithms. + +## Overview + +Symmetric memory windows provide a way to register memory buffers that benefit +from optimized collective operations. When using `NCCL_WIN_COLL_SYMMETRIC`, all +ranks must provide symmetric buffers, enabling optimized communication patterns +and better performance for large-scale multi-GPU operations. + +## What This Example Does + +1. **Allocates memory using NCCL allocator** (`ncclMemAlloc`) which provides + memory compatible with symmetric windows +2. **Registers buffers as symmetric windows** using `ncclCommWindowRegister` + with `NCCL_WIN_COLL_SYMMETRIC` flag +3. **Performs AllReduce sum operation** using the symmetric memory for optimized + communication performance + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to set +`MPI=1` at build time and can optionally provide a valid MPI installation under +`MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_sm +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_sm +``` + +## Code Structure + +### Key Components + +1. **Buffer Allocation and Window Registration**: +```c +size_t size_bytes; // Is set to the size of the send/receive buffers +void *d_sendbuff; +void *d_recvbuff; + +// Allocate buffers using ncclMemAlloc (compatible with symmetric memory) +NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); +NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + +ncclComm_t comm; +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register buffers as symmetric windows +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +2. **AllReduce Operation**: +```c +size_t count; // set to number of floats to exchange +cudaStream_t stream; // stream is set in cudaStreamCreate + +// Perform AllReduce with symmetric memory optimization +NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); +``` + +3. 
**Window Deregistration and Cleanup**:
+```c
+// Deregister symmetric memory windows
+NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+
+// Free buffers allocated with ncclMemAlloc
+NCCLCHECK(ncclMemFree(d_sendbuff));
+NCCLCHECK(ncclMemFree(d_recvbuff));
+```
+
+## Expected Output
+
+### With 4 GPUs (using pthreads/MPI)
+```
+Starting AllReduce example with 4 ranks
+ Rank 0 communicator initialized using device 0
+ Rank 1 communicator initialized using device 1
+ Rank 2 communicator initialized using device 2
+ Rank 3 communicator initialized using device 3
+Symmetric Memory allocation
+ Rank 0 allocating 4.00 MB per buffer
+ Rank 1 allocating 4.00 MB per buffer
+ Rank 2 allocating 4.00 MB per buffer
+ Rank 3 allocating 4.00 MB per buffer
+ Rank 0 data initialized (value: 0)
+ Rank 1 data initialized (value: 1)
+ Rank 2 data initialized (value: 2)
+ Rank 3 data initialized (value: 3)
+Starting AllReduce with 1048576 elements (4 MB)
+AllReduce completed successfully
+Verification - Expected: 6.0, Got: 6.0
+Results verified correctly
+ Rank 0 symmetric memory windows deregistered
+ Rank 1 symmetric memory windows deregistered
+ Rank 2 symmetric memory windows deregistered
+ Rank 3 symmetric memory windows deregistered
+All resources cleaned up successfully
+Example completed - demonstrated symmetric memory lifecycle
+```
+
+## Performance Benefits of Symmetric Memory
+
+Symmetric memory registration provides several performance advantages:
+
+- **Optimized Communication Algorithms**: NCCL can apply advanced optimizations
+  when all ranks have symmetric layouts
+- **Better Memory Access Patterns**: Consistent layouts enable better caching
+  and memory access optimization
+
+For more information on the performance benefits see the [Enabling Fast
+Inference and Resilient Training with NCCL
+2.27](https://developer.nvidia.com/blog/enabling-fast-inference-and-resilient-training-with-nccl-2-27/)
+blog post.
+
+**Important**: Buffers must be allocated using the CUDA Virtual Memory
+Management (VMM) API. NCCL provides the `ncclMemAlloc` convenience function for
+symmetric memory registration. The `NCCL_WIN_COLL_SYMMETRIC` flag requires all
+ranks to provide symmetric buffers consistently.
+
+## Key Insights
+
+- **Symmetric Memory Windows** are most beneficial for:
+  - Large-scale collective operations with consistent memory patterns
+  - Latency-sensitive kernels
+  - Applications with predictable allocation patterns
+- **ncclCommInitRank** can be used in both the pthread and the MPI parallel case
+- **Window registration** must happen on all ranks for collective operations
+- **Memory management** is critical - always deregister windows before freeing
+  memory
+
+## Common Issues and Solutions
+
+1. **Window Registration Failure**: Buffers MUST be allocated with the CUDA
+   Virtual Memory Management (VMM) API, e.g. via `ncclMemAlloc` (not
+   `cudaMalloc`), for symmetric memory.
+2. **Allocation Error**: If `ncclMemAlloc` fails, check the NCCL version
+   (requires at least v2.27) and available memory
+3. **Deregistration Order**: Always deregister windows before freeing memory or
+   destroying communicators
+4. **Symmetric Requirement**: All ranks must use `NCCL_WIN_COLL_SYMMETRIC`
+   consistently in collective operations
+5.
**Memory Leaks**: Always use `ncclMemFree` for buffers allocated with + `ncclMemAlloc` diff --git a/examples/05_symmetric_memory/01_allreduce/main.cc b/examples/05_symmetric_memory/01_allreduce/main.cc new file mode 100644 index 000000000..f2cda155e --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/main.cc @@ -0,0 +1,220 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +/* + * NCCL Symmetric Memory AllReduce Example + * + * This example demonstrates how to use NCCL's symmetric memory feature + * for collective operations. Symmetric memory provides optimized performance + * by leveraging consistent memory layouts across all participating ranks. + * + * Learning Objectives: + * - Learn how to register symmetric memory windows with NCCL communicators + * - See the proper lifecycle management of symmetric memory + * + */ + +/* + * This function can be called inside an MPI rank or pthread thread. The + * initialization and broadcast are implemented in common/src/utils.cc for + * easier readability. For fully integrated examples using pthreads or MPI see + * examples in 01_communicators. + */ +void *allReduce(int my_rank, int total_ranks, int local_device, + int devices_per_rank) { + + // ======================================================================== + // STEP 1: Initialize NCCL Communicator and Setup + // ======================================================================== + + ncclUniqueId nccl_unique_id; + if (my_rank == 0) { + printf("Starting AllReduce example with %d ranks\n", total_ranks); + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID. 
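+  // util_broadcast() is a small helper from examples/common: in MPI builds it
+  // wraps MPI_Bcast, in the pthread build it shares the ID through a pthread
+  // barrier (see common/src/utils.cc).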
+ // This step ensures all ranks have the same unique ID for communicator + // creation + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + // Each rank manages its assigned GPU device + CUDACHECK(cudaSetDevice(local_device)); + + // Initialize NCCL communicator + // This creates the communication context for collective operations + ncclComm_t comm; + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d communicator initialized using device %d\n", my_rank, + local_device); + + // ======================================================================== + // STEP 2: Allocate Memory Using NCCL Allocator + // ======================================================================== + + if (my_rank == 0) { + printf("Symmetric Memory allocation\n"); + } + // Allocate memory - using larger buffers to demonstrate symmetric memory + // benefits + size_t count = 1024 * 1024; // 1M elements + size_t size_bytes = count * sizeof(float); + + printf(" Rank %d allocating %.2f MB per buffer\n", my_rank, + (float)size_bytes / (1024 * 1024)); + + float *h_data = (float *)malloc(size_bytes); + + // Allocate buffers using NCCL allocator + // NCCL's allocator is compatible with symmetric memory layouts + void *d_sendbuff; + void *d_recvbuff; + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ======================================================================== + // STEP 3: Register Symmetric Memory Windows + // ======================================================================== + + /* Passing NCCL_WIN_COLL_SYMMETRIC requires users to provide the symmetric + * buffers among all ranks in collectives. + * Every rank needs to call ncclCommWindowRegister to register its buffers. 
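+   * Registration is a collective operation: all ranks must register windows of
+   * the same size before the buffers are used in a collective call.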
+ */ + + // Register symmetric memory windows with NCCL + ncclWindow_t send_win; + ncclWindow_t recv_win; + NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, + NCCL_WIN_COLL_SYMMETRIC)); + NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, + NCCL_WIN_COLL_SYMMETRIC)); + + // ======================================================================== + // STEP 4: Initialize Data and Prepare for Communication + // ======================================================================== + + // Initialize data - each rank contributes its rank value + // This creates a simple test pattern for verification + for (size_t i = 0; i < count; i++) { + h_data[i] = (float)my_rank; + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_data, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d data initialized (value: %d)\n", my_rank, my_rank); + + // Create stream for asynchronous operations + // Streams allow overlapping computation and communication + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // ======================================================================== + // STEP 5: Perform AllReduce Operation + // ======================================================================== + + if (my_rank == 0) { + printf("Starting AllReduce with %zu elements (%zu MB)\n", count, + size_bytes / (1024 * 1024)); + } + + // Perform AllReduce operation + // Since symmetric memory is registered, NCCL can apply optimized algorithms + NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); + + if (my_rank == 0) { + printf("AllReduce completed successfully\n"); + } + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + // Synchronize to ensure completion + CUDACHECK(cudaStreamSynchronize(stream)); + + // Verify results (optional - copy back and check) + float *h_result = (float *)malloc(size_bytes); + CUDACHECK(cudaMemcpy(h_result, d_recvbuff, size_bytes, + cudaMemcpyDeviceToHost)); + + // Each element should be the sum of all ranks + float expected_sum = (float)(total_ranks * (total_ranks - 1)) / 2; + bool all_ok = true; + if (my_rank == 0) { + printf("Verification - Expected: %.1f, Got: %.1f\n", expected_sum, + h_result[0]); + + for (size_t i = 1; i < count; i++) { + if (fabsf(h_result[i] - expected_sum) > 0.001) { + printf(" Results verification failed at index %zu: Expected %.1f, Got " + "%.1f\n", i, expected_sum, h_result[i]); + all_ok = false; + break; + } + } + + if (all_ok) { + printf("Results verified correctly\n"); + } else { + printf("Results verification failed\n"); + } + } + + // ======================================================================== + // STEP 7: Cleanup and Resource Management + // ======================================================================== + + // Important: Cleanup must happen in the correct order + // 1. Free host memory + // 2. Deregister symmetric memory windows + // 3. Free device memory + // 4. Destroy CUDA resources + // 5. 
Finalize and destroy NCCL communicator + + free(h_data); + free(h_result); + + // Deregister symmetric memory windows from communicator + // This must happen before freeing the buffers or destroying the + // communicator + NCCLCHECK(ncclCommWindowDeregister(comm, send_win)); + NCCLCHECK(ncclCommWindowDeregister(comm, recv_win)); + printf(" Rank %d symmetric memory windows deregistered\n", my_rank); + + // Free device memory allocated by NCCL + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Destroy CUDA stream + CUDACHECK(cudaStreamDestroy(stream)); + + // Finalize and destroy NCCL communicator + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + if (my_rank == 0) { + printf("All resources cleaned up successfully\n"); + printf("Example completed - demonstrated symmetric memory lifecycle\n"); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + // Run example using the standard test framework + // This handles MPI/pthread initialization, device assignment, and cleanup + return run_example(argc, argv, allReduce); +} diff --git a/examples/05_symmetric_memory/Makefile b/examples/05_symmetric_memory/Makefile new file mode 100644 index 000000000..c2c5ce506 --- /dev/null +++ b/examples/05_symmetric_memory/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Shared Memory Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Symmetric Memeory Examples" + @echo "===============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/05_symmetric_memory/README.md b/examples/05_symmetric_memory/README.md new file mode 100644 index 000000000..936ce4b78 --- /dev/null +++ b/examples/05_symmetric_memory/README.md @@ -0,0 +1,72 @@ + + +# NCCL Symmetric Memory Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL symmetric memory +windows for improving performance of collective operations when all ranks use +consistent memory layouts. 
+ +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with Symmetric Memory Windows** +- **Pattern**: Register symmetric windows per rank and use them for collectives +- **API**: `ncclCommWindowRegister`, `ncclCommWindowDeregister`, `ncclMemAlloc`, + `ncclAllReduce` +- **Use case**: Large-scale collectives with consistent buffer layouts across + ranks +- **Key features**: + - Buffers allocated via `ncclMemAlloc` for symmetric compatibility + - Windows registered as `NCCL_WIN_COLL_SYMMETRIC` + - Collective operations executed on symmetric windows + - Correct deregistration and cleanup + +## Choosing the Right Pattern + +*Scenario* : Large-scale training with consistent memory patterns +*Addresses* : Low-latency, high-bandwidth collectives on supported systems +*Dependencies* : pthread or MPI + +### Why Symmetric Windows? +Symmetric windows enable NCCL to apply optimized collective protocols when all +ranks use consistent layouts. The memory needs to be allocated through the CUDA +Virtual Memory Management (VMM) API and registered with NCCL. + +```c +// Allocate using NCCL provided convenience function and register symmetric windows +NCCLCHECK(ncclMemAlloc(&buffer, size_bytes)); +NCCLCHECK(ncclCommWindowRegister(comm, buffer, size_bytes, &win, NCCL_WIN_COLL_SYMMETRIC)); + +// Collective using symmetric windows +NCCLCHECK(ncclAllReduce(buffer, buffer, count, ncclFloat, ncclSum, comm, stream)); + +// Deregister and free +NCCLCHECK(ncclCommWindowDeregister(comm, win)); +NCCLCHECK(ncclMemFree(buffer)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce with symmetric windows +cd 01_allreduce && make +./allreduce_sm +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/06_device_api/01_allreduce/Makefile b/examples/06_device_api/01_allreduce/Makefile new file mode 100644 index 000000000..60b21c8bc --- /dev/null +++ b/examples/06_device_api/01_allreduce/Makefile @@ -0,0 +1,81 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_device_api + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cu $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cu=.o) +OBJECTS := $(OBJECTS:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cu + $(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." 
+ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Device API Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/06_device_api/01_allreduce/README.md b/examples/06_device_api/01_allreduce/README.md new file mode 100644 index 000000000..b403c6321 --- /dev/null +++ b/examples/06_device_api/01_allreduce/README.md @@ -0,0 +1,218 @@ + + +# NCCL Device API AllReduce Example + +This example shows how to implement AllReduce sum operation directly in a kernel +using the NCCL device API. We first create a device communicator with +`ncclDevCommCreate` to enable kernel-initiated communication. After that, +device-side synchronization is performed with barriers and symmetric memory +windows are used to enable Load Store Accessible (LSA) memory access of peers. + +## Overview + +This example shows how to implement AllReduce sum operation using a GPU kernel +that directly performs the collective operations. The device communicators are +created with `ncclDevCommCreate` and device-side synchronization is ensured with +Load Store Accessible (LSA) barriers. LSA windows are used for peer memory +access. + +## What This Example Does + +1. **Creates device communicators** using `ncclDevCommCreate` for GPU kernel + access to NCCL operations +2. **Registers symmetric memory windows** with `ncclCommWindowRegister` for + direct peer-to-peer access +3. **Launches GPU kernel** that performs AllReduce sum operation entirely on + device using LSA barriers + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to set +`MPI=1` at build time and can optionally provide a valid MPI installation under +`MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_device_api +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_device_api +``` + +## Code Walk-through + +### Device Communicator Creation (Host-side) +The `ncclDevComm` is the core component of the device API, enabling GPU kernels +to perform inter-GPU communication and fuse computation with communication. The +`ncclDevCommRequirements` specifies what resources the device communicator +should allocate. In this example, we set `lsaBarrierCount` to match our thread +block count, giving each block its own barrier for independent cross-GPU +synchronization. + +```cpp +ncclDevComm devComm; +ncclDevCommRequirements reqs; +// Allocate one barrier per CTA we intend to launch +reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; + +// Create device communicator with LSA barrier support +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +``` + +### Memory Window Registration (Host-side) +The device API requires symmetric memory windows registered using +`NCCL_WIN_COLL_SYMMETRIC`. 
See the [symmetric memory +example](../../05_symmetric_memory/) for allocation and requirements details. + +```cpp +ncclComm_t comm; +void* d_sendbuff; +void* d_recvbuff; +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register symmetric windows for device-side peer access +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +### LSA Barriers (Device-side) +LSA barriers enable cross-GPU synchronization from device code. Each thread +block uses `blockIdx.x` to select its dedicated barrier, allowing blocks to +progress independently while coordinating with corresponding blocks on other +GPUs. + +```cpp +// LSA barriers enable coordination between GPU threads across different ranks +// This ensures all ranks reach the same synchronization point before proceeding +ncclLsaBarrierSession bar { + ncclCoopCta(), // Barrier scope: entire CTA (thread block) + devComm, ncclTeamLsa(devComm), devComm.lsaBarrier, + blockIdx.x // Barrier index: matches our CTA index (0 to lsaBarrierCount-1) +}; +bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + +// ... + +// Release barrier ensures that we received data from everyone before we unblock the stream and allow the next kernel(s) to process the data. +// Critical for correctness in device-side collective operations +bar.sync(ncclCoopCta(), cuda::memory_order_release); +``` +### Memory Access (Device-side) +`ncclGetLsaPointer` allows CUDA kernels to directly access other GPUs' memory +within the LSA team. + +```cpp +// Access peer memory directly using LSA (Load/Store Accessible) pointers +float* peerPtr = (float*)ncclGetLsaPointer(sendwin, sendoffset, peer); +``` + +## Expected Output + +``` +Starting Device API AllReduce initialization + Rank 0 using GPU device 0 + Rank 1 using GPU device 1 + Rank 2 using GPU device 2 + Rank 3 using GPU device 3 + Rank 0 initialized NCCL communicator for 4 total ranks + Rank 1 initialized NCCL communicator for 4 total ranks + Rank 2 initialized NCCL communicator for 4 total ranks + Rank 3 initialized NCCL communicator for 4 total ranks + Rank 0 initialized data with value 0 + Rank 1 initialized data with value 1 + Rank 2 initialized data with value 2 + Rank 3 initialized data with value 3 + Rank 0 created device communicator with 16 LSA barriers + Rank 1 created device communicator with 16 LSA barriers + Rank 2 created device communicator with 16 LSA barriers + Rank 3 created device communicator with 16 LSA barriers +Starting AllReduce with 1048576 elements (4 MB) using Device API +Expected result: sum of ranks 0 to 3 = 6 per element + Rank 0 completed AllReduce kernel execution + Rank 1 completed AllReduce kernel execution + Rank 2 completed AllReduce kernel execution + Rank 3 completed AllReduce kernel execution +AllReduce completed. 
Result verification: PASSED +All elements correctly sum to 6 (ranks 0-3) +``` + +## When to Use + +- **Kernel-level communication**: When compute kernels need immediate access to + communication results +- **Low-latency scenarios**: Reduced host-device synchronization overhead +- **Custom collectives**: Implementing specialized reduction or communication + patterns +- **Iterative algorithms**: Repeated communication with minimal CPU involvement + +## Performance Considerations + +**Advantages:** +- Lower latency for small to medium message sizes +- Eliminates host-device synchronization bottlenecks +- Enables computation-communication fusion within kernels +- Direct peer memory access without CPU copying + +**Disadvantages:** +- More complex programming model requiring LSA barriers +- Requires careful memory ordering and synchronization +- Higher development complexity compared to host API +- CUDA Compute Capability 7.0+ and GPUs with P2P support (e.g., NVLink or PCI) + required. + +## Common Issues and Solutions + +### Issue: NCCL warning communicator does not support symmetric memory +NCCL selects support for symmetric memory operations based on GPU connectivity. +If the GPUs on a node are only connected through e.g. the inter-CPU link, +symmetric memory will not be supported. **Solution:** Use `nvidia-smi` to +identify and select a subset of GPUs (e.g. via `CUDA_VISIBLE_DEVICES`) connected +through NVlink or PCIe. + +### Issue: LSA barrier synchronization failures +**Solution:** Ensure `lsaBarrierCount` matches the number of thread blocks in +kernel launch configuration. + +### Issue: Memory access violations in device kernel +**Solution:** Verify memory windows are registered as `NCCL_WIN_COLL_SYMMETRIC` +and all ranks use identical buffer sizes. + +### Issue: Incomplete results or race conditions +**Solution:** Use proper memory ordering in LSA barriers +(`cuda::memory_order_relaxed` vs `cuda::memory_order_release`). + +## Performance Notes + +- These are educational examples, not optimized for performance +- Real implementations should use vectorization, loop unrolling, and memory + coalescing +- Consider NCCL's optimized device kernels for best practices related to + performance + - NCCL library implementation of device kernels for collective operations + - NCCL perf tests implementations of optimized device kernels + +## Error Handling + +The example uses comprehensive error checking for both CUDA and NCCL operations. +Device kernels should implement proper error handling for LSA operations and +memory access patterns. + +## Next Steps + +After understanding this example, explore: +- **Custom reduction operations**: Implement non-standard reduction patterns +- **Mixed host-device patterns**: Combine host and device API for complex + workflows +- **Performance optimization**: Fine-tune LSA barrier usage and memory access + patterns diff --git a/examples/06_device_api/01_allreduce/main.cu b/examples/06_device_api/01_allreduce/main.cu new file mode 100644 index 000000000..4eafe6cc1 --- /dev/null +++ b/examples/06_device_api/01_allreduce/main.cu @@ -0,0 +1,251 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "nccl_device.h" +#include "utils.h" +#include +#include +#include +#include +#include + +/* + * NCCL Device API AllReduce Example + * + * This example demonstrates NCCL's Device API, which enables GPU kernels to + * directly interact with NCCL without CPU intervention. This is particularly + * powerful for applications that need to perform communication + * from within CUDA kernels. + * + * Learning Objectives: + * - Understand NCCL Device API vs Host API differences + * - Learn how to register memory windows for device-side access + * - See how GPU kernels can perform collective operations directly + * - Practice LSA (Load Store Access) barrier synchronization + * + * Key Device API Concepts: + * - ncclDevComm: Device-side communicator for kernel use + * - ncclWindow_t: Memory windows that enable direct peer access + * - LSA barriers: Synchronization primitives for device-side coordination + * - ncclGetLsaPointer: Direct access to peer memory from device code + * + * When to Use Device API: + * - Compute kernels that need immediate communication results + * - Fusion of computation and communication in a single kernel + * - Reduced host-device synchronization overhead + * - Custom collective operations not available in standard NCCL + * + * Performance Considerations: + * - Lower latency than host API for small operations + * - Enables computation-communication overlap within kernels + * - Requires careful synchronization and memory ordering + * - LSA barriers add coordination overhead but enable correctness + */ + +// Device API kernel launch configuration +// CTA count must match lsaBarrierCount for proper barrier synchronization +#define NCCL_DEVICE_CTA_COUNT 16 +#define NCCL_DEVICE_THREADS_PER_CTA 512 + +// ========================================================================== +// Device Kernel Implementation +// ========================================================================== + +// Device kernel that performs AllReduce sum operation +// This kernel demonstrates direct NCCL communication from GPU threads +__global__ void simpleAllReduceKernel(ncclWindow_t sendwin, size_t sendoffset, + ncclWindow_t recvwin, size_t recvoffset, + size_t count, int root, struct ncclDevComm devComm) { + // LSA barriers enable coordination between GPU threads across different ranks + // Barrier scope: CTA (all threads in this block participate) + // Barrier index: blockIdx.x selects this CTA's dedicated barrier (one barrier per CTA) + ncclLsaBarrierSession bar { ncclCoopCta(), devComm, ncclTeamLsa(devComm), + devComm.lsaBarrier, blockIdx.x }; + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + const int rank = devComm.rank, nRanks = devComm.nRanks; + + // We are going to spread the workload accross all GPU ranks. + // So calculate the global thread ID accross all ranks. 
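+  // For example, with 4 ranks and the launch configuration above (16 CTAs of
+  // 512 threads), rank 1 / blockIdx.x 0 / threadIdx.x 0 gets
+  // globalTid = 0 + 512*(1 + 0*4) = 512 and globalNthreads = 512*16*4 = 32768,
+  // so it handles elements 512, 33280, ...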
+  // This maps global threads to elements of the data to be reduced
+  const int globalTid = threadIdx.x + blockDim.x * (rank + blockIdx.x * nRanks);
+  const int globalNthreads = blockDim.x * gridDim.x * nRanks;
+
+  // Grid-stride loop over all elements using all global threads
+  for (size_t offset = globalTid; offset < count; offset += globalNthreads) {
+    float v = 0;
+    // Access remote (and local [peer==rank]) memory and reduce locally
+    for (int peer = 0; peer < nRanks; peer++) {
+      float* peerPtr = (float*)ncclGetLsaPointer(sendwin, sendoffset, peer);
+      v += peerPtr[offset];
+    }
+    // Write the reduced value into every rank's receive buffer so that all
+    // ranks end up with the complete result
+    for (int peer = 0; peer < nRanks; peer++) {
+      float* recvPtr = (float*)ncclGetLsaPointer(recvwin, recvoffset, peer);
+      recvPtr[offset] = v;
+    }
+  }
+
+  // Release barrier ensures that we received data from everyone before we
+  // unblock the stream and allow the next kernel(s) to process the data.
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
+}
+
+  // Launch the AllReduce kernel - the grid size must match lsaBarrierCount
+  simpleAllReduceKernel<<<NCCL_DEVICE_CTA_COUNT, NCCL_DEVICE_THREADS_PER_CTA,
+                          0, stream>>>(
+      send_win, 0, recv_win, 0, count, 0, devComm);
+
+  // Wait for completion - kernel performs AllReduce.
+  CUDACHECK(cudaStreamSynchronize(stream));
+  printf(" Rank %d completed AllReduce kernel execution\n", my_rank);
+
+  // ==========================================================================
+  // STEP 7: Verify Results and Cleanup Resources
+  // ==========================================================================
+
+  // Verify results by copying back and checking
+  CUDACHECK(cudaMemcpy(h_data, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost));
+  float expected = (float)((total_ranks * (total_ranks - 1)) / 2);
+  bool success = true;
+  for (size_t i = 0; i < count; i++) {
+    if (h_data[i] != expected) {
+      success = false;
+      break;
+    }
+  }
+
+  if (my_rank == 0) {
+    printf("AllReduce completed. Result verification: %s\n",
+           success ? "PASSED" : "FAILED");
+    if (success) {
+      printf("All elements correctly sum to %.0f (ranks 0-%d)\n",
+             expected, total_ranks - 1);
+    }
+  }
+
+  // Cleanup resources in proper order
+  free(h_data);
+
+  // Device API specific cleanup
+  NCCLCHECK(ncclDevCommDestroy(comm, &devComm));
+  NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+  NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+  NCCLCHECK(ncclMemFree(d_sendbuff));
+  NCCLCHECK(ncclMemFree(d_recvbuff));
+
+  // Standard NCCL cleanup
+  CUDACHECK(cudaStreamDestroy(stream));
+  NCCLCHECK(ncclCommFinalize(comm));
+  NCCLCHECK(ncclCommDestroy(comm));
+
+  return NULL;
+}
+
+int main(int argc, char* argv[]) {
+  // Run example using the provided utility framework
+  return run_example(argc, argv, allReduce);
+}
diff --git a/examples/06_device_api/Makefile b/examples/06_device_api/Makefile
new file mode 100644
index 000000000..c217592f8
--- /dev/null
+++ b/examples/06_device_api/Makefile
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# +# See LICENSE.txt for license information +# + +# NCCL Shared Memory Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Device API Examples" + @echo "========================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/06_device_api/README.md b/examples/06_device_api/README.md new file mode 100644 index 000000000..3a5e17ae5 --- /dev/null +++ b/examples/06_device_api/README.md @@ -0,0 +1,70 @@ + + +# NCCL Device API Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL's device API, +enabling users to perform inter-GPU communication within their own kernels. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with Device Kernel Implementation** +- **Pattern**: GPU kernel performs collectives using device communicators +- **API**: `ncclDevCommCreate`, `ncclCommWindowRegister`, device-side LSA + barriers, `ncclAllReduce` +- **Use case**: Allreduce operations with custom operations, fusing allreduce + operation with previous/next compute operation. +- **Key features**: + - Device communicator creation with LSA barrier support + - Symmetric memory windows for peer memory access + - Device kernels coordinating via LSA barriers + - Host launches kernel; kernel performs AllReduce on-device + +## Choosing the Right Pattern + +*Scenario* : Custom kernels fusing computation and communication. +*Addresses* : Schedule communication from inside a CUDA kernel. +*Dependencies* : pthread or MPI + +### Why the Device API? 
+The device API allows NCCL communication within CUDA kernels, fusing communication and computation steps: +```cpp +// Host: +// 1) Create device communicator + requirements +// 2) Register symmetric memory window for peer access +ncclDevComm devComm; ncclDevCommRequirements reqs{}; +reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +NCCLCHECK(ncclCommWindowRegister(comm, buffer, size, &win, NCCL_WIN_COLL_SYMMETRIC)); + +// Device: +// - Use barriers for cross-GPU synchronization +// - Access peers via symmetric window (LSA pointers) +myAllReduceKernel<<>>(win, devComm); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run the device API AllReduce +cd 01_allreduce && make +./allreduce_device_api +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 000000000..9a9cd1b3d --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,54 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Examples Main Makefile + +# Define all category directories +CATEGORIES := 01_communicators 02_point_to_point 03_collectives 04_user_buffer_registration 05_symmetric_memory 06_device_api + +# Default target +all: $(CATEGORIES) + +# Build all categories +$(CATEGORIES): + @echo "Building $@..." + @$(MAKE) -C $@ + +# Clean all categories +clean: + @echo "Cleaning all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category clean; \ + done + +# Test all examples +test: all + @echo "Testing all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category test; \ + done + +# Install all examples +install: all + @echo "Installing all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category install; \ + done + +# Help target +help: + @echo "NCCL Examples Main Makefile" + @echo "===========================" + @echo "" + @echo "Available targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Build and test all examples" + @echo " install - Install all examples" + @echo " help - Show this help message" + @echo "" + +.PHONY: all clean test install help $(CATEGORIES) diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..f491eab5a --- /dev/null +++ b/examples/README.md @@ -0,0 +1,146 @@ + + +# NCCL Library Examples + +Welcome to the NCCL examples directory. This collection of NCCL (NVIDIA +Collective Communications Library) examples is designed to teach developers how +to effectively use NCCL in their applications. The examples progress from basic +concepts to advanced usage patterns, with each example featuring a detailed +README file. The APIs and features covered here are far from the complete set of +what NCCL provides. The [NCCL +Documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) +includes a detailed description of NCCL features and APIs. + +These examples showcase individual features but are not intended to maximize the +performance for an individual communication pattern. 
For a performance-oriented implementation please refer to the
+[nccl-tests](https://github.com/NVIDIA/nccl-tests/) GitHub repository.
+
+## Basic Examples
+We start with the most basic NCCL operations. All examples in this section are
+self-contained, meaning you can copy-paste one file and compile it on its own.
+The only dependencies are the NCCL library itself and, in the MPI case, an MPI
+implementation. These templates are aimed at new users coming up to speed with
+NCCL for GPU communication.
+
+### [Communicators](01_communicators/)
+
+This section teaches you how to create, test, and destroy a communicator. We
+have provided 3 examples using a single thread, multiple threads, and multiple
+processes. This section shows the different options for launching an NCCL
+application.
+
+### [Point to Point](02_point_to_point/)
+
+This sample send/recv implementation uses point-to-point communication to
+pass data around a simple ring.
+
+### [Collectives](03_collectives/)
+
+This sample implementation shows the most basic NCCL collective communication
+call.
+
+## Advanced Features
+
+These examples are intended for experienced users looking for best practices to
+use a specific feature. For complete end-to-end templates please use the basic
+examples.
+
+Since NCCL does not include its own launcher, we have provided two popular
+bootstrap mechanisms. By default these examples will be launched as separate
+threads, one thread per GPU. Users can set `MPI=1` to build an MPI parallel
+version which can run across multiple compute nodes. Users can optionally
+provide a valid MPI installation under `MPI_HOME`.
+
+Each example can be run individually. By default you can run each executable via
+```
+[NTHREADS=<num_threads>] ./<executable>
+```
+
+If `NTHREADS` is unset, the examples will use the number of visible GPUs as the
+number of threads. If the applications are built with `MPI` support, you can run
+each executable as
+
+```
+mpirun -np <num_processes> ./<executable>
+```
+
+To ease the readability of these examples we have moved the bootstrap and
+broadcast part to the [common](common/) directory. Completely self-contained
+examples are provided in the sections above.
+
+### [User Buffer Registration](04_user_buffer_registration/)
+
+User Buffer Registration eliminates the overhead of copying data between user
+buffers and internal NCCL buffers. This folder provides sample implementations
+using User Buffer Registration with common collectives.
+
+### [Symmetric Memory Registration](05_symmetric_memory/)
+
+Since version 2.27, NCCL supports window registration, which allows users to
+register local buffers into an NCCL window, enabling extremely low latency and
+high bandwidth communication in NCCL. This folder provides sample
+implementations using Symmetric Memory Registration with common collectives.
+
+### [Device APIs](06_device_api/)
+
+The device API enables GPU kernels to directly perform inter-GPU communication.
+This allows applications to perform communication from within CUDA kernels,
+fuse computation and communication, and gain fine-grained control over the
+collective implementation. This folder demonstrates how to implement
+collectives using device-side kernels.
+
+
+## Prerequisites
+
+- The same prerequisites as building NCCL from source.
+- Users can optionally add `MPI_HOME` for an MPI library in a non-standard
+  location.
+
+## Build Steps
+The examples can be built while building the NCCL library from source. Users can
+choose to build the examples with MPI support (`MPI=1`).
+
+```
+git clone https://github.com/NVIDIA/nccl.git
+cd nccl
+make -j examples [MPI=1]
+```
+
+or, if NCCL has already been built, the user can optionally add a non-standard
+NCCL installation location:
+
+```
+cd examples
+make NCCL_HOME=<path> [MPI=1]
+```
+
+## Environment Variables
+
+### Build Stage
+Users can use these optional variables to choose which libraries are used to
+build these examples:
+- `NCCL_HOME=<path>`: Local base directory of an NCCL installation.
+- `MPI` : [0,1] Build the examples with MPI support.
+- `MPI_HOME=<path>` : Local base directory of an MPI installation.
+- `CUDA_HOME=<path>` : Local base directory of a CUDA installation.
+
+### Run Stage
+- `NTHREADS=<num_threads>`: Number of threads to create for the threaded
+  examples. Defaults to the number of visible GPUs.
+- `CUDA_VISIBLE_DEVICES`: Comma-delimited list of GPUs visible to the
+  application.
+- All other NCCL [environment
+  variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html)
+  apply.
+
+## Supported OS
+Linux
+
+## Troubleshooting
+Each example includes a *Common Issues and Solutions* section for the individual
+tests. For general runtime issues use the debug output enabled by setting
+`NCCL_DEBUG=INFO` for detailed logging. The
+[Troubleshooting](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html)
+section of the NCCL documentation also includes many helpful tips.
diff --git a/examples/common/README.md b/examples/common/README.md
new file mode 100644
index 000000000..006c2b9a0
--- /dev/null
+++ b/examples/common/README.md
@@ -0,0 +1,36 @@
+# NCCL Common Utilities
+
+## Description
+This directory contains shared utilities and helper functions used across all NCCL examples. These utilities provide common functionality for error handling, device management, and MPI integration.
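+
+For instance, a minimal usage sketch (assuming the usual example variables such
+as `comm`, `total_ranks`, `nccl_unique_id`, `my_rank`, and `local_device` are
+already set up) wraps every CUDA and NCCL call in the corresponding macro so
+that a failure aborts with file and line information:
+
+```c
+// Abort with file/line details if the CUDA or NCCL call fails
+CUDACHECK(cudaSetDevice(local_device));
+NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank));
+```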
+ +## Components + +### Headers (`include/`) +- **utils.h**: General utility functions +- **nccl_utils.h**: NCCL error checking macros +- **mpi_utils.h**: MPI error checking macros + +### Source Files (`src/`) +- **utils.cc**: General utility functions + +## Key Features + +### Error Checking Macros +```c +#define NCCLCHECK(cmd) // NCCL error checking +#define CUDACHECK(cmd) // CUDA error checking +#define MPICHECK(cmd) // MPI error checking +``` + +## Usage in Examples +Include the headers in your example source files: +```c +#include "utils.h" +#include "mpi_utils.h" +``` + +## Notes +- All utilities include comprehensive error checking +- Functions are designed to be thread-safe +- Memory management functions handle null pointers safely +- MPI utilities are only needed for multi-process examples diff --git a/examples/common/include/mpi_utils.h b/examples/common/include/mpi_utils.h new file mode 100644 index 000000000..151b730d2 --- /dev/null +++ b/examples/common/include/mpi_utils.h @@ -0,0 +1,23 @@ +#ifndef MPI_UTILS_H_ +#define MPI_UTILS_H_ + +#include "mpi.h" +#include +#include + +// MPI error checking macro +#define MPICHECK(cmd) \ + do { \ + int err = cmd; \ + if (err != MPI_SUCCESS) { \ + char error_string[MPI_MAX_ERROR_STRING]; \ + int length; \ + MPI_Error_string(err, error_string, &length); \ + fprintf(stderr, "MPI error at %s:%d - %s\n", __FILE__, __LINE__, \ + error_string); \ + fprintf(stderr, "Failed MPI operation: %s\n", #cmd); \ + MPI_Abort(MPI_COMM_WORLD, err); \ + } \ + } while (0) + +#endif diff --git a/examples/common/include/nccl_utils.h b/examples/common/include/nccl_utils.h new file mode 100644 index 000000000..ff98de1de --- /dev/null +++ b/examples/common/include/nccl_utils.h @@ -0,0 +1,40 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_UTILS_H_ +#define NCCL_UTILS_H_ + +#include +#include +#include +#include + +#include "cuda_runtime.h" + +// Error checking +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/examples/common/include/utils.h b/examples/common/include/utils.h new file mode 100644 index 000000000..ec3dec334 --- /dev/null +++ b/examples/common/include/utils.h @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef UTILS_H_ +#define UTILS_H_ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "nccl_utils.h" +#include +#include + +#ifdef MPI_SUPPORT +#include "mpi.h" +#include "mpi_utils.h" +#else +#include +#include +#endif + +/** + * Broadcast NCCL unique ID + * + * Broadcasts the NCCL unique ID from the root rank to all other ranks. + * Uses MPI_Bcast in MPI mode and pthread barrier in pthread mode. + * + * @param root Root rank that holds the NCCL unique ID + * @param my_rank Current rank or thread id + * @param arg Pointer to NCCL unique ID to broadcast + * + * @return 0 on success, non-zero on error + */ +int util_broadcast(int root, int my_rank, ncclUniqueId *arg); + +/** + * Run the given NCCL example in parallel + * + * This function performs the complete NCCL example lifecycle: + * 1. Initialize backend (MPI or pthread) + * 2. Execute NCCL communicator setup function + * 3. Cleanup of all resources + * + * @param argc Command line argument count + * @param argv Command line arguments + * @param ncclExample Function pointer to example-specific NCCL setup + * + * @return 0 on success, non-zero on error + */ +int run_example(int argc, char *argv[], + void *(*ncclExample)(int, int, int, int)); + +#endif // UTILS_H_ diff --git a/examples/common/src/utils.cc b/examples/common/src/utils.cc new file mode 100644 index 000000000..20a411cbc --- /dev/null +++ b/examples/common/src/utils.cc @@ -0,0 +1,334 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "utils.h" +#include + +#ifndef MPI_SUPPORT +pthread_barrier_t barrier; +ncclUniqueId nccl_unique_id; +#endif + +/** + * Common context structure for both MPI and pthread examples + * + * This structure provides a unified interface for NCCL examples that can + * run in either MPI mode (one process per device) or pthread mode + * (one thread per device). + */ +typedef struct { + // Common variables + int total_ranks; // Total number of MPI ranks or pthreads + int devices_per_rank; // Number of devices per rank or thread + int local_device; // Node local rank or thread id (0 to total_ranks-1) + int my_rank; // Rank or thread id for NCCL + ncclUniqueId nccl_id; // NCCL unique ID + ncclComm_t comm; // NCCL communicator (single for all modes) + ncclUniqueId *nccl_unique_id; // NCCL unique ID pointer + void *func; + + // pthread-specific variables +#ifndef MPI_SUPPORT + pthread_t *threads; // Thread array + int *thread_ranks; // Thread rank array +#endif +} context_t; + +/** + * Initialize MPI or pthread backend + * + * Sets up the backend and populates common context variables. 
+ * + * MPI Mode (compiled with MPI=1): + * - Initializes MPI (rank, size) + * - Calculates local rank based on splitting communicator by node multi-node + * support + * - Generates and broadcasts NCCL unique ID + * - Sets device assignment based on local rank + * + * pthread Mode (default): + * - Gets thread count from NTHREADS environment or GPU count + * - Validates thread count against available GPUs + * - Generates NCCL unique ID for sharing across threads + * - Allocates thread management resources + * + * @param argc Command line argument count + * @param argv Command line arguments + * @param ctx Output: Populated example context + * + * @return 0 on success, non-zero on error + */ +int initialize(int argc, char *argv[], context_t *ctx); + +/** + * Wrap function to call the example function in a thread + * + * Note: This function is needed since pthread only allows a single void* + * argument. + */ +void *thread_wrapper(void *arg); + +/** + * Run ncclExample in parallel using MPI or pthreads + * + * Starts the execution of the given NCCL example function in parallel + * + * MPI Mode: + * - Starts the function on each rank + * - Checks the output and calls MPI_Barrier to synchronize + * + * pthread Mode: + * - Creates threads (one per device) + * - Each thread runs the given example function + * - Waits for all threads to complete + * + * @param ctx Context with backend setup completed + * @param ncclExample Function pointer to example-specific NCCL setup + * + * @return 0 on success, non-zero on error + * + * Note: This function expects ncclExample() to be defined by the example. + * The ncclExample function should have signature: + * void* ncclExample(int, int, int, int) + */ +int run_parallel(context_t *ctx, void *(*ncclExample)(int, int, int, int)); + +/** + * Clean up resources + * + * Properly cleans up all resources allocated during initialization. + * Note: NCCL communicators are destroyed by ncclCommSetup function. 
+ * + * MPI Mode: + * - Finalizes MPI + * + * pthread Mode: + * - Frees thread arrays + * + * @param ctx Context to clean up + */ +void cleanup(context_t *ctx); + +/** + * Broadcast NCCL unique ID + */ +int util_broadcast(int root, int my_rank, ncclUniqueId *arg) { +#ifdef MPI_SUPPORT + MPICHECK( + MPI_Bcast(arg, sizeof(ncclUniqueId), MPI_BYTE, root, MPI_COMM_WORLD)); +#else + if (my_rank == root) { + nccl_unique_id = *arg; + } + int barrier_err = pthread_barrier_wait(&barrier); + if (barrier_err != 0 && barrier_err != PTHREAD_BARRIER_SERIAL_THREAD) { + fprintf(stderr, "pthread_barrier_wait failed at %s:%d with error code %d\n", + __FILE__, __LINE__, barrier_err); + abort(); + } + if (my_rank != root) { + *arg = nccl_unique_id; + } +#endif + return 0; +} + +/** + * Initialize MPI or pthread backend + */ +int initialize(int argc, char *argv[], context_t *ctx) { +#ifdef MPI_SUPPORT + // Initialize MPI + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &ctx->my_rank)); + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &ctx->total_ranks)); + + if (ctx->my_rank == 0) { + printf("Number of processes: %d\n", ctx->total_ranks); + } + // Only for printing the output in order + MPI_Barrier(MPI_COMM_WORLD); + printf("MPI initialized: rank %d of %d\n", ctx->my_rank, ctx->total_ranks); + + // Split the communicator based on shared memory (i.e., nodes) + MPI_Comm node_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, ctx->my_rank, + MPI_INFO_NULL, &node_comm); + + // Get the rank within the node communicator + MPI_Comm_rank(node_comm, &ctx->local_device); + + // Clean up the node communicator + MPI_Comm_free(&node_comm); + +#else + + // Get number of devices (threads) from environment or default to available + // GPUs + int num_gpus = 0; + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + ctx->total_ranks = num_gpus; // Default to all available GPUs + const char *nThreadsEnv = getenv("NTHREADS"); + if (nThreadsEnv) { + ctx->total_ranks = atoi(nThreadsEnv); + } + + printf("Creating %d threads for %d devices\n", ctx->total_ranks, num_gpus); + + if (ctx->total_ranks < 1) { + printf("Invalid number of threads: %d\n", ctx->total_ranks); + return 1; + } + + // Check if we have enough GPUs + if (ctx->total_ranks > num_gpus) { + printf("Error: Requested %d threads but only %d GPUs available\n", + ctx->total_ranks, num_gpus); + printf("Please reduce NTHREADS to %d or fewer\n", num_gpus); + return 1; + } + + // Thread synchronization needed for unique ID sharing later on + pthread_barrier_init(&barrier, NULL, ctx->total_ranks); + + // Generate NCCL unique ID (shared across all threads) + NCCLCHECK(ncclGetUniqueId(&ctx->nccl_id)); + + // Allocate thread resources + ctx->threads = (pthread_t *)malloc(ctx->total_ranks * sizeof(pthread_t)); + ctx->thread_ranks = (int *)malloc(ctx->total_ranks * sizeof(int)); + if (ctx->threads == NULL || ctx->thread_ranks == NULL) { + printf("Failed to allocate memory for threads\n"); + return 1; + } +#endif + + return 0; +} + +/** + * Wrap function to call the example function in a thread + * + * Note: This function is needed since pthread only allows a single void* + * argument. 
+ */ +void *thread_wrapper(void *arg) { + context_t *ctx = (context_t *)arg; + void *(*example_func)(int, int, int, int) = + (void *(*)(int, int, int, int))ctx->func; + return example_func(ctx->my_rank, ctx->total_ranks, ctx->local_device, + ctx->devices_per_rank); +} + +/** + * Run ncclExample in parallel using MPI or pthreads + */ +int run_parallel(context_t *ctx, void *(*ncclExample)(int, int, int, int)) { +#ifdef MPI_SUPPORT + if (ctx->my_rank == 0) { + printf("NCCL Example: One Device per Process\n"); + printf("====================================\n"); + } + + if (ncclExample(ctx->my_rank, ctx->total_ranks, ctx->local_device, + ctx->devices_per_rank) != NULL) + return 1; + // Synchronize to ensure ordered output + MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); +#else + printf("NCCL Example: One Device per Thread\n"); + printf("===================================\n"); + + // Create separate context for each thread + context_t *thread_contexts = + (context_t *)malloc(ctx->total_ranks * sizeof(context_t)); + if (thread_contexts == NULL) { + printf("Failed to allocate thread contexts\n"); + return 1; + } + ncclUniqueId *nccl_unique_id = + (ncclUniqueId *)calloc(1, sizeof(ncclUniqueId)); + + for (int i = 0; i < ctx->total_ranks; i++) { + // Copy main context to thread context + memcpy(&thread_contexts[i], ctx, sizeof(context_t)); + thread_contexts[i].threads = NULL; + thread_contexts[i].thread_ranks = NULL; + thread_contexts[i].my_rank = i; // Set NCCL rank to thread id + thread_contexts[i].local_device = i; + thread_contexts[i].total_ranks = ctx->total_ranks; + thread_contexts[i].devices_per_rank = 1; + thread_contexts[i].func = (void *)ncclExample; + thread_contexts[i].nccl_unique_id = nccl_unique_id; + ctx->thread_ranks[i] = i; + pthread_create(&ctx->threads[i], NULL, thread_wrapper, &thread_contexts[i]); + } + + // Wait for all threads to complete + for (int i = 0; i < ctx->total_ranks; i++) { + pthread_join(ctx->threads[i], NULL); + } + + free(thread_contexts); +#endif + + return 0; +} + +/** + * Run the given NCCL example in parallel + */ +int run_example(int argc, char *argv[], + void *(*ncclExample)(int, int, int, int)) { + + // 1. Allocate context + context_t *ctx = (context_t *)calloc(1, sizeof(context_t)); + if (ctx == NULL) { + printf("Failed to allocate memory for context\n"); + return 1; + } + + // 2. Initialize backend (MPI or pthread) + if (initialize(argc, argv, ctx) != 0) { + printf("Failed to initialize backend\n"); + return 1; + } + + // 3. Start the given example code in parallel + if (run_parallel(ctx, ncclExample) != 0) { + printf("Failed to execute NCCL operations\n"); + cleanup(ctx); // Cleanup on failure + return 1; + } + + // 3. Cleanup + cleanup(ctx); + + // 4. Print common success message +#ifdef MPI_SUPPORT + if (ctx->my_rank == 0) { +#endif + printf("\nAll NCCL communicators finalized successfully!\n"); +#ifdef MPI_SUPPORT + } +#endif + + return 0; +} + +/** + * Clean up resources + */ +void cleanup(context_t *ctx) { +#ifdef MPI_SUPPORT + // Free MPI resources + MPICHECK(MPI_Finalize()); +#else + free(ctx->threads); + free(ctx->thread_ranks); + pthread_barrier_destroy(&barrier); +#endif +} diff --git a/makefiles/examples.mk b/makefiles/examples.mk new file mode 100644 index 000000000..6f3a520f3 --- /dev/null +++ b/makefiles/examples.mk @@ -0,0 +1,31 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Make sure NCCL headers are found and libraries are linked +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +# Build configuration +INCLUDES = -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include +LIBRARIES = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib +LDFLAGS = -lcudart -lnccl -Wl,-rpath,$(NCCL_HOME)/lib + + +# MPI configuration +ifeq ($(MPI), 1) + +ifdef MPI_HOME +MPICXX ?= $(MPI_HOME)/bin/mpicxx +MPIRUN ?= $(MPI_HOME)/bin/mpirun +else +MPICXX ?= mpicxx +MPIRUN ?= mpirun +endif + +CXXFLAGS += -DMPI_SUPPORT +endif From 834ef7231913ecf22f5cad29d7e26a6596f36452 Mon Sep 17 00:00:00 2001 From: Stephen Sachs Date: Tue, 14 Oct 2025 15:02:05 +0200 Subject: [PATCH 20/21] Remove the github actions to auto-close older issues --- .github/workflows/close-old-issues.js | 79 ------------------------- .github/workflows/close_old_issues.yaml | 31 ---------- 2 files changed, 110 deletions(-) delete mode 100644 .github/workflows/close-old-issues.js delete mode 100644 .github/workflows/close_old_issues.yaml diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js deleted file mode 100644 index 57e110339..000000000 --- a/.github/workflows/close-old-issues.js +++ /dev/null @@ -1,79 +0,0 @@ -const { Octokit } = require("@octokit/rest"); - -const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN }); - -const owner = process.env.REPO_OWNER; -const repo = process.env.REPO_NAME.split('/').pop(); // Handles owner/repo format - -const now = new Date(); -const sixMonthsAgo = new Date(now); -sixMonthsAgo.setMonth(now.getMonth() - 6); -const oneMonthAgo = new Date(now); -oneMonthAgo.setMonth(now.getMonth() - 1); - -async function closeOldIssues() { - let page = 1; - let closedCount = 0; - - // write a multiline comment into a variable: - let body = `### Issue Cleanup: Helping Us Focus on Current Challenges - -We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue hasn't seen updates in over 6 months, we'll be closing it for now. - -*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., \"Still relevant on [version=X]\"). 
-Thanks for your understanding and for contributing to NCCL.`; - - while (true) { - const { data: issues } = await octokit.issues.listForRepo({ - owner, - repo, - state: "open", - per_page: 100, - page, - }); - - if (issues.length === 0) break; - - for (const issue of issues) { - // Ignore PRs - if (issue.pull_request) continue; - - // Ignore issues with label "ongoing" - if (issue.labels.some(label => label.name === "ongoing")) continue; - - const createdAt = new Date(issue.created_at); - const updatedAt = new Date(issue.updated_at); - - if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) { - - // Add a comment before closing - await octokit.issues.createComment({ - owner, - repo, - issue_number: issue.number, - body: body, - }); - - await octokit.issues.update({ - owner, - repo, - issue_number: issue.number, - state: "closed", - state_reason: "not_planned", - }); - closedCount++; - console.log(`Closed issue #${issue.number}`); - - // Break out if we have closed 100 issues - if (closedCount >= 100) { - console.log("Closed 100 issues, stopping."); - return; - } - } - } - page++; - } - console.log(`Total closed: ${closedCount}`); -} - -closeOldIssues().catch(console.error); diff --git a/.github/workflows/close_old_issues.yaml b/.github/workflows/close_old_issues.yaml deleted file mode 100644 index 15d81cb54..000000000 --- a/.github/workflows/close_old_issues.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: Close Old Issues - -on: - schedule: - - cron: '30 2 * * *' # Runs daily at 02:30 UTC - workflow_dispatch: - -permissions: - issues: write - -jobs: - close-old-issues: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - - - name: Install dependencies - run: npm install @octokit/rest@22.0.0 - - - name: Run close-old-issues script - run: node .github/workflows/close-old-issues.js - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_OWNER: ${{ github.repository_owner }} - REPO_NAME: ${{ github.event.repository.name || github.repository }} From ae7aed194dc63c65d1bf5c0385ba3d68d3b64c8c Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Fri, 17 Oct 2025 17:17:50 -0700 Subject: [PATCH 21/21] NCCL 2.28.7-1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU-Initiated Networking (GIN): * Provides device-side API for integrating GPU-Initiated Networking capability into application kernels. * New transport layer called DOCA GPUNetIO. * New ncclGin construct to create, destroy and manipulate GIN contexts. * New ncclGinBarrierSession to provide synchronization functionality. * New put, signal, counter operations for data movement and signaling. * GIN API signatures and functionalities are subject to change. * GIN Support Requirements * CUDA 12.2 or later when compiling the GPU code * NVIDIA GPUs: Volta or newer. NVIDIA GPU drivers >= 510.40.3 * NVIDIA NICs: CX4 or newer. rdma-core >= 44.0 * Requires nvidia-peermem or DMABUF support. When using DMABUF, linux kernel >= 6.1 is required. New ncclCommRevoke API for fault tolerance: * Introduces ncclCommRevoke to quiesce ongoing NCCL work on a communicator without freeing resources. * This answers the need for a lightweight way to cancel in-flight collectives and bring a communicator to a safe state before split/shrink/finalize/destroy. * Includes optional cross-rank coordination (global barrier) and supports blocking/non-blocking usage. 
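Since these notes only name the new API, here is a minimal, hypothetical sketch of how a revoke-then-teardown flow could look. The one-argument ncclCommRevoke(comm) call is an assumed signature (the real prototype and any coordination flags may differ); ncclCommGetAsyncError, ncclCommFinalize, and ncclCommDestroy are existing NCCL host APIs.

```c
/* Hypothetical fault-tolerance sketch: quiesce in-flight work after an
 * asynchronous error, then tear the communicator down. The one-argument
 * ncclCommRevoke(comm) form is an assumption for illustration only. */
#include <nccl.h>

ncclResult_t recoverComm(ncclComm_t comm) {
  ncclResult_t asyncErr;
  ncclResult_t res = ncclCommGetAsyncError(comm, &asyncErr);
  if (res != ncclSuccess) return res;
  if (asyncErr != ncclSuccess && asyncErr != ncclInProgress) {
    (void)ncclCommRevoke(comm);   // assumed: cancel in-flight collectives, keep resources
    (void)ncclCommFinalize(comm); // flush whatever work remains
    return ncclCommDestroy(comm); // free resources
  }
  return ncclSuccess;
}
```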
New NCCL Environment Plugin:
* The env plugin allows users to set NCCL environment variables, for example after loading them from a centralized database.
* The NCCL_ENV_PLUGIN variable can be used to let NCCL load an external environment plugin.
New NCCL Examples on GitHub:
* The NCCL examples directory provides users and developers with practical code samples that highlight NCCL’s core features.
* It covers basic operations like communicator initialization, point-to-point communication, and collective operations, as well as advanced features such as user buffer registration, symmetric memory, and the device API.
Device API improvements:
* Adds the ncclFindWindow API.
* Adds a new ncclBarrierSession to provide hybrid synchronization functionality.
* Makes multimem available with as few as two ranks.
* Removes distance (NCCL_P2P_LEVEL) considerations from determining the availability of symmetric memory.
Enhanced NCCL RAS output:
* Extends the RAS subsystem with a JSON output format to support machine-parsable metrics collection.
* Enables structured data export for monitoring tools, dashboards, and automated analysis systems.
GitHub Pull Requests resolved:
* Fast Init - CPU Optimizations for NCCL Initialization Large Scale. (PR #1789)
* Fast Init - Improve Bootstrap AllGather by 2x at large scale by sending bootstrap information bidirectionally. (PR #1791)
* Fixes spurious failures when PyTorch is statically linked with NCCL-2.28.3 because the error is not drained and instead gets propagated into the next CUDA kernel invocation. (PR #1864)
Other notable improvements:
* Fixes multicast object leaks in case of failed NVLS user buffer registrations, which could lead to crashes. Avoids such registration attempts when incompatible memory allocators are used.
* Fixes potential data corruption with built-in symmetric kernels for small messages with size granularity under 8 bytes, or when multiple symmetric operations were aggregated in a group.
* Generalizes the existing point-to-point scheduling to the case of an uneven GPU count per node.
* Fixes a crash when network plugin assignment fails.
* Fixes a large performance issue with NCCL_CROSS_NIC=0 and certain split mask settings, where NCCL cannot find a viable ring.
* Fixes a crash when NCCL is compiled with recent CUDA versions but runs on hosts with certain older CUDA drivers.
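The examples themselves are not part of this excerpt, but the common helpers added earlier in this series (examples/common) show the intended wiring: run_example() drives one MPI rank or one pthread per GPU and hands each a callback. Below is an illustrative caller, not code from the actual examples directory; only run_example, util_broadcast, NCCLCHECK, and CUDACHECK come from the helpers, the rest is standard NCCL/CUDA API, and the collective work itself is elided.

```c
/* Illustrative sketch of an example built on examples/common; not taken from
 * the actual examples directory. One process (MPI mode) or thread (pthread
 * mode) per GPU calls this function with its rank and local device index. */
#include "utils.h"

static void* helloNccl(int my_rank, int total_ranks, int local_device,
                       int devices_per_rank) {
  (void)devices_per_rank; // single device per rank/thread in this sketch
  CUDACHECK(cudaSetDevice(local_device));

  // Rank 0 creates the unique ID; util_broadcast shares it via MPI_Bcast
  // or the pthread barrier, depending on how the helpers were built.
  ncclUniqueId id;
  if (my_rank == 0) NCCLCHECK(ncclGetUniqueId(&id));
  util_broadcast(0, my_rank, &id);

  ncclComm_t comm;
  NCCLCHECK(ncclCommInitRank(&comm, total_ranks, id, my_rank));
  // ... enqueue collectives on a CUDA stream here ...
  NCCLCHECK(ncclCommDestroy(comm));
  return NULL; // NULL tells the helper framework the example succeeded
}

int main(int argc, char* argv[]) {
  return run_example(argc, argv, helloNccl);
}
```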
--- CMakeLists.txt | 4 + ext-tuner/example/plugin.c | 2 +- ext-tuner/example/test/test_plugin.c | 6 +- makefiles/common.mk | 9 +- makefiles/version.mk | 2 +- src/CMakeLists.txt | 28 +- src/Makefile | 63 +- src/bootstrap.cc | 51 +- src/ce_coll.cc | 26 +- src/debug.cc | 17 +- src/dev_runtime.cc | 198 +- src/device/CMakeLists.txt | 2 +- src/device/Makefile | 3 +- src/device/generate.py | 10 +- src/device/network/unpack/unpack.h | 2 +- src/device/reduce_kernel.h | 2 +- src/device/symmetric/all_gather.cuh | 2 +- src/device/symmetric/generate.py | 10 +- src/device/symmetric/primitives.cuh | 13 +- src/device/symmetric/reduce_scatter.cuh | 6 +- src/enqueue.cc | 40 +- src/gin/CMakeLists.txt | 8 + src/gin/gin_host.cc | 277 + src/gin/gin_host_proxy.cc | 501 ++ src/graph/paths.cc | 38 +- src/graph/rings.cc | 29 +- src/graph/search.cc | 80 +- src/graph/topo.cc | 92 +- src/graph/topo.h | 2 + src/graph/tuning.cc | 2 +- src/graph/xml.cc | 29 +- src/include/allocator.h | 4 + src/include/channel.h | 7 +- src/include/checks.h | 15 + src/include/comm.h | 13 +- src/include/debug.h | 8 + src/include/dev_runtime.h | 1 + src/include/device.h | 6 +- src/include/env.h | 23 + src/include/gin/gin_host.h | 54 + src/include/gin/gin_host_proxy.h | 28 + src/include/graph.h | 11 +- src/include/group.h | 4 + src/include/nccl_device.h | 10 +- src/include/nccl_device/barrier.h | 47 + src/include/nccl_device/coop.h | 73 +- src/include/nccl_device/core.h | 19 + src/include/nccl_device/gin.h | 207 + src/include/nccl_device/gin/gdaki/gin_gdaki.h | 214 + .../gin/gdaki/gin_gdaki_device_host_common.h | 36 + src/include/nccl_device/gin/gin_device_api.h | 18 + .../nccl_device/gin/gin_device_common.h | 120 + .../nccl_device/gin/gin_device_host_common.h | 24 + src/include/nccl_device/gin/proxy/gin_proxy.h | 235 + .../gin/proxy/gin_proxy_device_host_common.h | 125 + src/include/nccl_device/gin_barrier.h | 37 + src/include/nccl_device/impl/barrier__funcs.h | 94 + src/include/nccl_device/impl/barrier__types.h | 29 + src/include/nccl_device/impl/comm__types.h | 13 +- src/include/nccl_device/impl/core__funcs.h | 32 + src/include/nccl_device/impl/core__types.h | 4 +- src/include/nccl_device/impl/gin__funcs.h | 407 ++ src/include/nccl_device/impl/gin__types.h | 10 + .../nccl_device/impl/gin_barrier__funcs.h | 66 + .../nccl_device/impl/gin_barrier__types.h | 31 + ..._barrier__funcs.h => lsa_barrier__funcs.h} | 2 +- ..._barrier__types.h => lsa_barrier__types.h} | 2 +- src/include/nccl_device/ll_a2a.h | 4 +- .../{mem_barrier.h => lsa_barrier.h} | 0 src/include/{ => nccl_device}/net_device.h | 7 +- src/include/nccl_device/utility.h | 74 +- src/include/net.h | 3 + src/include/nvtx.h | 3 +- src/include/nvtx_payload_schemas.h | 3 +- src/include/plugin/env/env_v1.h | 33 + src/include/plugin/nccl_env.h | 16 + src/include/plugin/nccl_net.h | 7 +- src/include/plugin/net/net_v11.h | 50 +- src/include/plugin/plugin.h | 2 + src/include/proxy.h | 3 + src/include/register.h | 3 + src/include/socket.h | 9 +- src/include/sym_kernels.h | 3 +- src/include/transport.h | 13 +- src/include/utils.h | 10 + src/init.cc | 374 +- src/libnccl.map | 8 + src/misc/ibvsymbols.cc | 2 +- src/misc/ibvwrap.cc | 5 + src/misc/param.cc | 5 +- src/misc/socket.cc | 23 + src/nccl.h.in | 12 + src/nccl_device/CMakeLists.txt | 3 +- src/nccl_device/gin_barrier.cc | 22 + .../{mem_barrier.cc => lsa_barrier.cc} | 2 +- src/plugin/CMakeLists.txt | 3 + src/plugin/env.cc | 111 + src/plugin/env/CMakeLists.txt | 7 + src/plugin/env/env_v1.cc | 40 + src/plugin/net.cc | 147 +- 
src/plugin/net/net_v10.cc | 3 +- src/plugin/net/net_v11.cc | 11 +- src/plugin/net/net_v6.cc | 1 - src/plugin/net/net_v7.cc | 1 - src/plugin/net/net_v8.cc | 1 - src/plugin/net/net_v9.cc | 1 - src/plugin/plugin_open.cc | 14 +- src/proxy.cc | 1 + src/ras/client.cc | 56 +- src/ras/client_support.cc | 310 +- src/ras/ras_internal.h | 9 + src/register/register.cc | 15 +- src/scheduler/symmetric_sched.cc | 4 + src/sym_kernels.cc | 18 +- src/transport.cc | 56 +- src/transport/CMakeLists.txt | 11 + src/transport/gdaki/CMakeLists.txt | 65 + .../include/common/doca_gpunetio_verbs_def.h | 398 ++ .../include/common/doca_gpunetio_verbs_dev.h | 203 + .../device/doca_gpunetio_dev_verbs_common.cuh | 422 ++ .../doca_gpunetio_dev_verbs_counter.cuh | 421 ++ .../device/doca_gpunetio_dev_verbs_cq.cuh | 295 + .../doca_gpunetio_dev_verbs_onesided.cuh | 508 ++ .../device/doca_gpunetio_dev_verbs_qp.cuh | 824 +++ .../include/doca_gpunetio_config.h | 45 + .../include/doca_gpunetio_device.h | 47 + .../include/doca_gpunetio_host.h | 49 + .../doca-gpunetio/include/host/doca_error.h | 89 + .../include/host/doca_gpunetio.h | 387 ++ .../include/host/doca_gpunetio_high_level.h | 191 + .../doca-gpunetio/include/host/doca_verbs.h | 2467 +++++++ .../doca-gpunetio/include/host/mlx5_ifc.h | 5693 +++++++++++++++++ .../doca-gpunetio/include/host/mlx5_prm.h | 170 + .../gdaki/doca-gpunetio/src/doca_gpunetio.cpp | 942 +++ .../src/doca_gpunetio_gdrcopy.cpp | 261 + .../doca-gpunetio/src/doca_gpunetio_gdrcopy.h | 55 + .../src/doca_gpunetio_high_level.cpp | 903 +++ .../doca-gpunetio/src/doca_gpunetio_log.cpp | 77 + .../doca-gpunetio/src/doca_gpunetio_log.hpp | 43 + .../gdaki/doca-gpunetio/src/doca_internal.hpp | 118 + .../gdaki/doca-gpunetio/src/doca_verbs_cq.cpp | 472 ++ .../gdaki/doca-gpunetio/src/doca_verbs_cq.hpp | 151 + .../src/doca_verbs_cuda_wrapper.cpp | 129 + .../src/doca_verbs_cuda_wrapper.h | 96 + .../src/doca_verbs_device_attr.cpp | 266 + .../src/doca_verbs_device_attr.hpp | 96 + .../src/doca_verbs_ibv_wrapper.cpp | 374 ++ .../src/doca_verbs_ibv_wrapper.h | 452 ++ .../src/doca_verbs_mlx5dv_wrapper.cpp | 287 + .../src/doca_verbs_mlx5dv_wrapper.h | 431 ++ .../src/doca_verbs_net_wrapper.h | 62 + .../gdaki/doca-gpunetio/src/doca_verbs_qp.cpp | 2743 ++++++++ .../gdaki/doca-gpunetio/src/doca_verbs_qp.hpp | 211 + .../doca-gpunetio/src/doca_verbs_srq.cpp | 580 ++ .../doca-gpunetio/src/doca_verbs_srq.hpp | 109 + .../doca-gpunetio/src/doca_verbs_uar.cpp | 197 + .../doca-gpunetio/src/doca_verbs_uar.hpp | 109 + .../doca-gpunetio/src/doca_verbs_umem.cpp | 212 + .../doca-gpunetio/src/doca_verbs_umem.hpp | 118 + src/transport/gdaki/gin_host_gdaki.cc | 1065 +++ src/transport/gdaki/gin_host_gdaki.h | 36 + src/transport/net_ib.cc | 664 +- src/transport/net_ib_gin.h | 29 + src/transport/nvls.cc | 64 +- src/transport/p2p.cc | 10 +- 165 files changed, 28241 insertions(+), 497 deletions(-) create mode 100644 src/gin/CMakeLists.txt create mode 100644 src/gin/gin_host.cc create mode 100644 src/gin/gin_host_proxy.cc create mode 100644 src/include/env.h create mode 100644 src/include/gin/gin_host.h create mode 100644 src/include/gin/gin_host_proxy.h create mode 100644 src/include/nccl_device/barrier.h create mode 100644 src/include/nccl_device/gin.h create mode 100644 src/include/nccl_device/gin/gdaki/gin_gdaki.h create mode 100644 src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h create mode 100644 src/include/nccl_device/gin/gin_device_api.h create mode 100644 src/include/nccl_device/gin/gin_device_common.h create mode 100644 
src/include/nccl_device/gin/gin_device_host_common.h create mode 100644 src/include/nccl_device/gin/proxy/gin_proxy.h create mode 100644 src/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h create mode 100644 src/include/nccl_device/gin_barrier.h create mode 100644 src/include/nccl_device/impl/barrier__funcs.h create mode 100644 src/include/nccl_device/impl/barrier__types.h create mode 100644 src/include/nccl_device/impl/gin__funcs.h create mode 100644 src/include/nccl_device/impl/gin__types.h create mode 100644 src/include/nccl_device/impl/gin_barrier__funcs.h create mode 100644 src/include/nccl_device/impl/gin_barrier__types.h rename src/include/nccl_device/impl/{mem_barrier__funcs.h => lsa_barrier__funcs.h} (99%) rename src/include/nccl_device/impl/{mem_barrier__types.h => lsa_barrier__types.h} (98%) rename src/include/nccl_device/{mem_barrier.h => lsa_barrier.h} (100%) rename src/include/{ => nccl_device}/net_device.h (89%) create mode 100644 src/include/plugin/env/env_v1.h create mode 100644 src/include/plugin/nccl_env.h create mode 100644 src/libnccl.map create mode 100644 src/nccl_device/gin_barrier.cc rename src/nccl_device/{mem_barrier.cc => lsa_barrier.cc} (94%) create mode 100644 src/plugin/env.cc create mode 100644 src/plugin/env/CMakeLists.txt create mode 100644 src/plugin/env/env_v1.cc create mode 100644 src/transport/gdaki/CMakeLists.txt create mode 100644 src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_def.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_dev.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_common.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_counter.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_cq.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_onesided.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_qp.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_config.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_device.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_host.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_error.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio_high_level.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_verbs.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/mlx5_ifc.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/mlx5_prm.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_high_level.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_internal.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.hpp create mode 100644 
src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_net_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.hpp create mode 100644 src/transport/gdaki/gin_host_gdaki.cc create mode 100644 src/transport/gdaki/gin_host_gdaki.h create mode 100644 src/transport/net_ib_gin.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1941cdafe..d1eb6cc8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,10 @@ if(MAX_EXT_NET_PLUGINS GREATER 0) add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS}) endif() +add_definitions(-DDOCA_VERBS_USE_CUDA_WRAPPER) +add_definitions(-DDOCA_VERBS_USE_NET_WRAPPER) +add_definitions(-DNCCL_GIN_PROXY_ENABLE=1) + # Library dependencies find_library(RT_LIBRARY NAMES rt) if(RT_LIBRARY) diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index af813495a..d9b84b884 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -308,7 +308,7 @@ __hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, // Set NVLSTree base network latency to 24us constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0; } - + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); if (!ctx) return ncclSystemError; diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index c0300d51c..328762519 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -767,16 +767,16 @@ int test_nvl_domain_info() { .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck) .maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity) }; - + void* context = NULL; ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed"); - + // Validate NVLD info structure TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)"); TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain"); TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain"); - + // Clean up pluginFinalize(context); printf("NVLink domain info test passed!\n"); diff --git a/makefiles/common.mk b/makefiles/common.mk index f8f455dec..2b1d1c4b3 100644 --- 
a/makefiles/common.mk +++ b/makefiles/common.mk @@ -20,7 +20,7 @@ NET_PROFILER ?= 0 MLX5DV ?= 0 MAX_EXT_NET_PLUGINS ?= 0 -NVCC = $(CUDA_HOME)/bin/nvcc +NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -85,6 +85,8 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xp # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVCUFLAGS_SYM := + ########## GCOV ########## GCOV ?= 0 # disable by default. GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 @@ -158,3 +160,8 @@ endif ifneq ($(MAX_EXT_NET_PLUGINS), 0) CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS) endif + +CXXFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER +NVCUFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER + +CXXFLAGS += -DNCCL_GIN_PROXY_ENABLE=1 diff --git a/makefiles/version.mk b/makefiles/version.mk index d0e97c065..7bb671ddf 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 28 -NCCL_PATCH := 3 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5ab69dc92..b48ed1880 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,6 +39,7 @@ add_subdirectory(device) add_subdirectory(nccl_device) add_subdirectory(ras) add_subdirectory(scheduler) +add_subdirectory(gin) add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=) @@ -52,6 +53,8 @@ list(APPEND LIBSRCFILES ${RAS_SOURCES} ${SYM_SOURCES} ${SCHEDULER_SOURCES} + ${GIN_SOURCES} + ${DOCA_SOURCES} ) ###################### Create a shared NCCL library ############################ @@ -65,6 +68,7 @@ target_include_directories(nccl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin ${CUDAToolkit_INCLUDE_DIRS} + ${DOCA_HOME}/include ${CUDAToolkit_INCLUDE_DIRS}/cccl ) @@ -80,9 +84,25 @@ add_custom_command( BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h ) -add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h) +file(GLOB_RECURSE SRC_DEVICE_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device/*.h) + +# Copy all device header files to the destination +foreach(HEADER_FILE ${SRC_DEVICE_HEADERS}) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE} ${CMAKE_BINARY_DIR}/${HEADER_FILE} COPYONLY) + list(APPEND DEVICE_HEADERS ${CMAKE_BINARY_DIR}/${HEADER_FILE}) +endforeach() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device.h ${CMAKE_BINARY_DIR}/include/nccl_device.h COPYONLY) + +add_custom_target(nccl_header DEPENDS + ${CMAKE_BINARY_DIR}/include/nccl.h + ${CMAKE_BINARY_DIR}/include/nccl_device.h + ${DEVICE_HEADERS} + ${DEVICE_DOCA_HEADERS} +) add_dependencies(nccl nccl_header) +add_dependencies(nccl_device nccl_header) # Set version and output name set_target_properties(nccl PROPERTIES @@ -111,6 +131,11 @@ target_link_libraries(nccl ${EXTRA_LIBS} ) +# Add version script for symbol visibility control +target_link_options(nccl PRIVATE + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libnccl.map" +) + # Set output directories for nccl shared library set_target_properties(nccl PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" @@ -149,6 +174,7 @@ target_include_directories(nccl_static PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin ${CUDAToolkit_INCLUDE_DIRS} + 
transport/gdaki/doca-gpunetio/include ${CUDAToolkit_INCLUDE_DIRS}/cccl ) diff --git a/src/Makefile b/src/Makefile index be026cc26..471a0335e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,7 +8,7 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_device.h \ - $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h)) + $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/*/*.h include/nccl_device/*/*/*.h)) LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ @@ -16,13 +16,16 @@ LIBSRCFILES := \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ + $(wildcard transport/gdaki/*.cc) \ $(wildcard register/*.cc) \ $(wildcard plugin/*.cc) \ $(wildcard plugin/net/*.cc) \ $(wildcard plugin/tuner/*.cc) \ $(wildcard plugin/profiler/*.cc) \ + $(wildcard plugin/env/*.cc) \ $(wildcard nccl_device/*.cc) \ $(wildcard scheduler/*.cc) \ + $(wildcard gin/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -40,6 +43,7 @@ LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig BINDIR := $(BUILDDIR)/bin + ##### target files CUDARTLIB ?= cudart_static @@ -61,6 +65,17 @@ INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest +# DOCA GPUNetIO definitions +DOCA_HOME ?= transport/gdaki/doca-gpunetio +DOCA_INC_INSTALL := $(INCDIR)/nccl_device/gin/gdaki/doca_gpunetio +DOCA_OBJDIR := $(OBJDIR)/transport/gdaki/doca-gpunetio +DOCA_INCLUDES := $(DOCA_HOME)/include/doca_gpunetio_device.h $(wildcard $(DOCA_HOME)/include/common/*.h) $(wildcard $(DOCA_HOME)/include/device/*.cuh) +DOCA_INCTARGETS := $(DOCA_INCLUDES:$(DOCA_HOME)/include/%=$(DOCA_INC_INSTALL)/%) +INCTARGETS += $(DOCA_INCTARGETS) +DOCA_LIBSRC := doca_verbs_qp.cpp doca_verbs_cq.cpp doca_verbs_device_attr.cpp doca_verbs_umem.cpp doca_verbs_srq.cpp doca_verbs_uar.cpp doca_gpunetio.cpp doca_gpunetio_log.cpp doca_gpunetio_high_level.cpp doca_verbs_cuda_wrapper.cpp doca_verbs_mlx5dv_wrapper.cpp doca_verbs_ibv_wrapper.cpp doca_gpunetio_gdrcopy.cpp +DOCA_LIBOBJ := $(DOCA_LIBSRC:%.cpp=$(DOCA_OBJDIR)/%.o) +LIBOBJ += $(DOCA_LIBOBJ) + ##### rules build : lib staticlib binary @@ -94,7 +109,7 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) - $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) + $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) -Wl,--version-script=libnccl.map ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) @@ -137,6 +152,36 @@ $(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h mkdir -p $(INCDIR)/nccl_device/impl install -m 644 $< $@ +$(INCDIR)/nccl_device/gin/%.h: include/nccl_device/gin/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/gdaki/%.h: include/nccl_device/gin/gdaki/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/gdaki + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/proxy/%.h: include/nccl_device/gin/proxy/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/proxy + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/%.h: $(DOCA_HOME)/include/%.h + @printf "Grabbing %-35s > %s\n" 
$< $@ + mkdir -p $(DOCA_INC_INSTALL) + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/common/%.h: $(DOCA_HOME)/include/common/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/common + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/device/%.cuh: $(DOCA_HOME)/include/device/%.cuh + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/device + install -m 644 $< $@ + $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) @@ -145,8 +190,18 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -M $< > $(@:%.o=%.d.tmp) + @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) + @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ + sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) + @rm -f $(@:%.o=%.d.tmp) + +$(DOCA_OBJDIR)/%.o : $(DOCA_HOME)/src/%.cpp + @printf "Compiling %-35s > %s\n" $< $@ + mkdir -p `dirname $@` + $(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -c $< -o $@ + @$(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 7615b9c52..ff4a8eb24 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -226,6 +226,21 @@ static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, return ncclSuccess; } +static ncclResult_t socketDoubleSendRecv(struct ncclSocketOp ops[4]) { + // ops synchronously exchange size then asynchronously exchange data in send->recv->send->recv order + int senderRecvSize1, senderRecvSize2; + NCCLCHECK(ncclSocketSendRecv(ops[0].sock, &ops[0].size, sizeof(int), ops[1].sock, &senderRecvSize1, sizeof(int))); + NCCLCHECK(ncclSocketSendRecv(ops[2].sock, &ops[2].size, sizeof(int), ops[3].sock, &senderRecvSize2, sizeof(int))); + if (senderRecvSize1 > ops[1].size || senderRecvSize2 > ops[3].size) { + WARN("Message truncated : received %d,%d bytes instead of %d,%d", senderRecvSize1, senderRecvSize2, ops[1].size, ops[3].size); + return ncclInternalError; + } + ops[1].size = std::min(ops[1].size, senderRecvSize1); + ops[3].size = std::min(ops[3].size, senderRecvSize2); + NCCLCHECK(ncclSocketMultiOp(ops, 4)); + return ncclSuccess; +} + union ringConnectInfo { union ncclSocketAddress addr; char handle[NCCL_NET_HANDLE_MAXSIZE]; @@ -1007,22 +1022,40 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC if (recvDataHandle) netDereg(net, recvComm, &recvDataHandle); return res; } -static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct ncclSocket* recvSock, int rank, int nranks, char* data, int size) { +static ncclResult_t socketRingAllGather(struct ncclSocket* nextSock, struct ncclSocket* prevSock, int rank, int nranks, char* data, int size) { ncclResult_t res = ncclSuccess; uint64_t tFirst = 0, tRest = 0; /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from prev * and send previous step's data from (rank-i) to next */ - 
TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started"); + TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started: rank=%d nranks=%d", rank, nranks); + int totalSteps = nranks / 2; + TRACE(NCCL_BOOTSTRAP, "bidirectional bootstrap: totalSteps=%d", totalSteps); BOOTSTRAP_PROF_OPEN(tFirst); - for (int i = 0; i < nranks - 1; i++) { - size_t rslice = (rank - i - 1 + nranks) % nranks; - size_t sslice = (rank - i + nranks) % nranks; - void* recv_data = data + rslice * size; - void* send_data = data + sslice * size; - NCCLCHECKGOTO(socketSendRecv(sendSock, send_data, size, recvSock, recv_data, size), res, exit); - if (i == 0) { + for (int step = 0; step < totalSteps; step++) { + // N ranks requires (N-1)/2 steps for the double ring algorithm. If N is even, the last step is requires a single send/recv + bool isFinalUnidirectional = (step == totalSteps - 1) && (nranks % 2 == 0); + // Ring0: ring from previous to next + int sendSliceRing0 = (rank - step + nranks) % nranks; // Send this slice to next neighbor + int recvSliceRing0 = (rank - step - 1 + nranks) % nranks; // Receive this slice from prev neighbor + // Ring1: ring from next to previous + int sendSliceRing1 = (rank + step) % nranks; // Send this slice to prev neighbor + int recvSliceRing1 = (rank + step + 1) % nranks; // Receive this slice from next neighbor + if (isFinalUnidirectional) { + // Final unidirectional step, only Ring0 is used + NCCLCHECKGOTO(socketSendRecv(nextSock, data + sendSliceRing0 * size, size, prevSock, data + recvSliceRing0 * size, size), res, exit); + } else { + // Bidirectional step: Ring0 and Ring1 are used simultaneously + struct ncclSocketOp ops[4] = { + {NCCL_SOCKET_SEND, nextSock, data + sendSliceRing0 * size, size, 0}, // Ring0: send to next + {NCCL_SOCKET_RECV, prevSock, data + recvSliceRing0 * size, size, 0}, // Ring0: recv from prev + {NCCL_SOCKET_SEND, prevSock, data + sendSliceRing1 * size, size, 0}, // Ring1: send to prev + {NCCL_SOCKET_RECV, nextSock, data + recvSliceRing1 * size, size, 0} // Ring1: recv from next + }; + NCCLCHECKGOTO(socketDoubleSendRecv(ops), res, exit); + } + if (step == 0) { BOOTSTRAP_PROF_CLOSE(tFirst); BOOTSTRAP_PROF_OPEN(tRest); } diff --git a/src/ce_coll.cc b/src/ce_coll.cc index 3f3dcbd7f..b2bf32b8b 100644 --- a/src/ce_coll.cc +++ b/src/ce_coll.cc @@ -55,13 +55,13 @@ ncclResult_t ncclCeInit(struct ncclComm* comm) { ncclResult_t ncclCeFinalize(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; - + // Clean up ceInitTaskQueue while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); free(task); } - + // Clean up CE resources if (comm->ceColl.baseUCSymReadyPtr != NULL) { if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) { @@ -117,7 +117,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatc void* dstPtr = isComplete ? 
(void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail); - + // Write our own ready/complete flag to the multi-cast address CUDACHECKGOTO(cudaMemcpyAsync( mcDstPtr, @@ -194,7 +194,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { // Get pointers to the ready and complete synchronization arrays uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; - + // Allocate enough slots for all possible ops size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks; size_t opIdx = 0; @@ -220,7 +220,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { opIdx++; } } - + // Execute all memory operations in a single batch CUCHECKGOTO(cuStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail); @@ -236,7 +236,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) { ncclResult_t ret = ncclSuccess; - + params->srcs = nullptr; params->dsts = nullptr; params->sizes = nullptr; @@ -247,7 +247,7 @@ ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int n params->attrIdxs = nullptr; params->numAttrs = 0; #endif - + NCCLCHECKGOTO(ncclCalloc(¶ms->srcs, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->dsts, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->sizes, nRanks), ret, fail); @@ -284,7 +284,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa int driverVersion; NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail); - + //--------------Graph capture-------------- // cudaMemcpyBatchAsync is not supported during CUDA graph capture if (capturing) { @@ -372,7 +372,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of each rank's data chunk const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -423,7 +423,7 @@ ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to every other rank const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -442,7 +442,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, int dstRank = (comm->rank + r) % comm->nRanks; uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; - + if (dstRank == comm->rank) { // Local copy for own data batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; @@ -478,7 +478,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data root sends to each rank const size_t chunkBytes = args->nElts * args->eltSize; 
uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -538,7 +538,7 @@ ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, c ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to root const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; diff --git a/src/debug.cc b/src/debug.cc index 0d6ed8400..9e2bfa459 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -16,6 +16,7 @@ #include #include "param.h" #include +#include "env.h" #define NCCL_DEBUG_RESET_TRIGGERED (-2) @@ -37,9 +38,12 @@ static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; +typedef const char* (*ncclGetEnvFunc_t)(const char*); + // This function must be called with ncclDebugLock locked! static void ncclDebugInit() { - const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); + ncclGetEnvFunc_t getEnvFunc = ncclEnvPluginInitialized() ? ncclGetEnv : (ncclGetEnvFunc_t)getenv; + const char* nccl_debug = getEnvFunc("NCCL_DEBUG"); int tempNcclDebugLevel = -1; uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) { @@ -47,6 +51,7 @@ static void ncclDebugInit() { fclose(ncclDebugFile); ncclDebugFile = stdout; } + if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { @@ -65,7 +70,7 @@ static void ncclDebugInit() { * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ - const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS"); + const char* ncclDebugSubsysEnv = getEnvFunc("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } @@ -117,7 +122,7 @@ static void ncclDebugInit() { free(ncclDebugSubsys); } - const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); + const char* ncclWarnSetDebugInfoEnv = getEnvFunc("NCCL_WARN_ENABLE_DEBUG_INFO"); if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { int64_t value; errno = 0; @@ -127,7 +132,7 @@ static void ncclDebugInit() { } // Determine which debug levels will have timestamps. - const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + const char* timestamps = getEnvFunc("NCCL_DEBUG_TIMESTAMP_LEVELS"); if (timestamps == nullptr) { ncclDebugTimestampLevels = (1< VERSION */ - const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE"); + const char* ncclDebugFileEnv = getEnvFunc("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; diff --git a/src/dev_runtime.cc b/src/dev_runtime.cc index 54e6e01bf..60cb200aa 100644 --- a/src/dev_runtime.cc +++ b/src/dev_runtime.cc @@ -18,8 +18,11 @@ struct ncclDevrMemory { int refCount; struct ncclDevrMemory* next; CUmemGenericAllocationHandle memHandle; + void* primaryAddr; // What we hope is the VA of this memory's first mapping. 
size_t size; size_t bigOffset; // offset in big VA space + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]; + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]; }; struct ncclDevrWindowSorted { @@ -56,12 +59,21 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { struct ncclDevrState* devr = &comm->devrState; if (devr->bigSize != 0) return ncclSuccess; - bool lsaIsLocal = true; - for (int i=0; i < comm->localRanks; i++) { - lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i; + // LSA needs to be the same size for all ranks, and it needs to represent + // a consecutive set of ranks. + int lsaSize = 0; + int nodeSize = 1; + for (int r=1; r < comm->nRanks; r++) { + if (comm->rankToNode[r] == comm->rankToNode[r-1]) { + nodeSize += 1; + } else { + lsaSize = gcd(lsaSize, nodeSize); + nodeSize = 1; + } } - devr->lsaSelf = lsaIsLocal ? comm->localRank : 0; - devr->lsaSize = lsaIsLocal ? comm->localRanks : 1; + lsaSize = gcd(lsaSize, nodeSize); + devr->lsaSize = lsaSize; + devr->lsaSelf = comm->rank % lsaSize; devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int)); for (int i=0; i < devr->lsaSize; i++) { devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf); @@ -83,7 +95,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { } devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32); INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30); - + ncclSpaceConstruct(&devr->bigSpace); ncclShadowPoolConstruct(&devr->shadows); return ncclSuccess; @@ -103,7 +115,7 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue); free(task); } - + symTeamDestroyAll(comm); { // delete windowTable cudaStream_t stream; @@ -336,11 +348,17 @@ static void symTeamDestroyAll(struct ncclComm* comm) { } } +static ncclResult_t symMemoryRegisterGin(struct ncclComm* comm, struct ncclDevrMemory* mem) { + NCCLCHECK(ncclGinConnectOnce(comm)); + NCCLCHECK(ncclGinRegister(comm, mem->primaryAddr, mem->size, mem->ginHostWins, mem->ginDevWins)); + return ncclSuccess; +} + // On success we take caller's reference on memHandle. // Due to multicast binds for each pre-exiting team, this function requires // caller do a world barrier before returning to user. static ncclResult_t symMemoryObtain( - struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, void* memAddr, size_t size, struct ncclDevrMemory** outMem ) { ncclResult_t ret = ncclSuccess; @@ -355,12 +373,14 @@ static ncclResult_t symMemoryObtain( } mem = mem->next; } + // New memory. mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory)); mem->refCount = 0; mem->memHandle = memHandle; + mem->primaryAddr = memAddr; mem->size = size; - + // Grab offset in the big space. NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem); mem->bigOffset = bigOffset; @@ -368,10 +388,20 @@ static ncclResult_t symMemoryObtain( // Map unicast addresses into flat VA space for lsa team. NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space); + // If our caller doesn't have a VA then we'll use the LSA mapping. + if (mem->primaryAddr == nullptr) { + mem->primaryAddr = (char*)devr->lsaFlatBase + devr->lsaSelf*devr->bigSize + mem->bigOffset; + } + // Bind new memory with each existing team. 
for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams); } + + if (devr->ginEnabled) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail_mem_space_teams); + } + // Add to list of mems. mem->next = devr->memHead; devr->memHead = mem; @@ -398,6 +428,9 @@ static void symMemoryDropRef( ) { if (mem != nullptr && 0 == --mem->refCount) { struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinDeregister(comm, mem->ginHostWins); + } for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { symUnbindTeamMemory(comm, t, mem); } @@ -461,18 +494,22 @@ static ncclResult_t symWindowCreate( winDevHost->lsaRank = devr->lsaSelf; winDevHost->worldRank = comm->rank; winDevHost->winHost = (void*)win; + winDevHost->ginOffset4K = memOffset>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winDevHost->ginWins[i] = mem->ginDevWins[i]; + } CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream)); NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); int i = 0; while (i < 32 && tableHost->entries[i].window != nullptr) i += 1; if (i < 32) { tableHost->entries[i].base = userAddr; - tableHost->entries[i].size = userAddr + userSize; + tableHost->entries[i].size = userSize; tableHost->entries[i].window = winDev; CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream)); break; @@ -482,7 +519,6 @@ static ncclResult_t symWindowCreate( CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream)); } tableDev = tableHost->next; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost)); } { // insert into winSorted[] @@ -511,9 +547,9 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi symMemoryDropRef(comm, winHost->memory); { struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); int i = 0; while (i < 32 && tableHost->entries[i].window != winDev) i += 1; if (i < 32) { @@ -523,7 +559,6 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi } if (tableHost->next == nullptr) break; // Error didn't find window in table tableDev = tableHost->next; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted); } } NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted); @@ -578,7 +613,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast(memAddr)), ret, fail_locReg); // Trade cumem handle for ncclDevrMemory* - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle); + NCCLCHECKGOTO(symMemoryObtain(comm, 
memHandle, (void*)memAddr, memSize, &mem), ret, fail_locReg_memHandle); memHandle = 0x0; // symMemoryObtain took our reference CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); @@ -587,7 +622,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream ), ret, fail_locReg_memHandle_mem_stream); mem = nullptr; // symWindowCreate took our reference - + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win); // symWindowCreate needs barrier. @@ -679,15 +714,35 @@ ncclResult_t ncclDevrCommCreateInternal( struct ncclDevrState* devr = &comm->devrState; struct ncclTeam world = ncclTeamWorld(comm); struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize); + bool ginActivated = false; struct ncclDevrTeam* tmLsa; size_t bufSizeTotal; + int nGinContexts = 0; + int ginSignalTotal = 0, ginCounterTotal = 0; struct ncclDevResourceRequirements* resReqsHead; struct ncclDevResourceRequirements lsaBarReq; cudaStream_t stream = nullptr; + struct ncclDevResourceRequirements railGinBarrierReq; CUmemGenericAllocationHandle memHandle = 0x0; struct ncclDevrMemory* mem = nullptr; struct ncclDevrWindow* win = nullptr; struct ncclWindow_vidmem* winHost = nullptr; + size_t ginSignalShadowsOffset = 0; + + if (comm->nNodes > 1 || reqs->ginForceEnable || reqs->ginCounterCount != 0 || reqs->ginSignalCount != 0) { + ginActivated = !devr->ginEnabled; + devr->ginEnabled = true; + } + + if (ginActivated) { + NCCLCHECKGOTO(ncclGinConnectOnce(comm), ret, fail); + // Register all preexisting memories with GIN. Update the windows later when + // we have a stream. + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail); + } + } + if (devr->ginEnabled) nGinContexts = comm->sharedRes->ginState.ginCommCount; memset(outDevComm, 0, sizeof(*outDevComm)); outDevComm->rank = comm->rank; @@ -713,25 +768,52 @@ ncclResult_t ncclDevrCommCreateInternal( resReqsHead = reqs->resourceRequirementsList; - ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq); + ncclLsaBarrierCreateRequirement(lsa, std::max(reqs->barrierCount, reqs->lsaBarrierCount), &outDevComm->lsaBarrier, &lsaBarReq); lsaBarReq.next = resReqsHead; resReqsHead = &lsaBarReq; + ncclGinBarrierCreateRequirement(comm, ncclTeamRail(comm), std::max(reqs->barrierCount, reqs->railGinBarrierCount), &outDevComm->railGinBarrier, &railGinBarrierReq); + railGinBarrierReq.next = resReqsHead; + resReqsHead = &railGinBarrierReq; + { struct ncclDevResourceRequirements* rr = resReqsHead; bufSizeTotal = 0; + ginSignalTotal = reqs->ginSignalCount; + ginCounterTotal = reqs->ginCounterCount; while (rr != nullptr) { bufSizeTotal = alignUp(bufSizeTotal, std::max(128, rr->bufferAlign)); if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128; + if (rr->outGinSignalStart != nullptr) *rr->outGinSignalStart = ginSignalTotal; + if (rr->outGinCounterStart != nullptr) *rr->outGinCounterStart = ginCounterTotal; bufSizeTotal += rr->bufferSize; + ginSignalTotal += rr->ginSignalCount; + ginCounterTotal += rr->ginCounterCount; rr = rr->next; } + bufSizeTotal= alignUp(bufSizeTotal, 128); + ginSignalShadowsOffset = bufSizeTotal; + bufSizeTotal += nGinContexts*ginSignalTotal*sizeof(uint64_t); // include signal shadows bufSizeTotal = alignUp(bufSizeTotal, devr->granularity); } 
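// --- Illustrative aside (not part of the patch): a hedged sketch of the
// resource-requirement walk above. Each requirement receives a slice of one
// large device buffer, aligned to max(128, bufferAlign), and its "handle" is
// simply the slice offset expressed in 128-byte units. The names Req,
// alignUpTo and accumulateResources are hypothetical and exist only here.
#include <algorithm>
#include <cstddef>

struct Req {
  size_t bufferSize;
  size_t bufferAlign;
  size_t* outBufferHandle;  // receives offset/128 for this slice
  Req* next;
};

static size_t alignUpTo(size_t x, size_t a) { return (x + a - 1) / a * a; }

static size_t accumulateResources(Req* head, size_t granularity) {
  size_t total = 0;
  for (Req* r = head; r != nullptr; r = r->next) {
    total = alignUpTo(total, std::max<size_t>(128, r->bufferAlign));
    if (r->outBufferHandle != nullptr) *r->outBufferHandle = total / 128;
    total += r->bufferSize;
  }
  // Round the whole pool up to the allocation granularity, as the code above
  // does with devr->granularity before cuMemCreate.
  return alignUpTo(total, granularity);
}
// --- End of illustrative aside.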
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); - NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists - outDevComm->windowTable = comm->devrState.windowTable; + if (ginActivated) { + // Now update the GIN handles in all existing windows. Registration of memories happened above. + for (int i=0; i < devr->winSortedCount; i++) { + struct ncclDevrWindow* win = devr->winSorted[i].win; + struct ncclWindow_vidmem* winHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream); + winHost->ginOffset4K = (win->bigOffset - win->memory->bigOffset)>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winHost->ginWins[i] = win->memory->ginDevWins[i]; + } + CUDACHECKGOTO(cudaMemcpyAsync(win->vidmem, winHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream), ret, fail_stream); + } + } + + NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail_stream); // ensure devr->windowTable exists + outDevComm->windowTable = devr->windowTable; if (bufSizeTotal == 0) { outDevComm->resourceWindow = nullptr; @@ -741,45 +823,65 @@ ncclResult_t ncclDevrCommCreateInternal( memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; memProp.requestedHandleTypes = ncclCuMemHandleType; + // We have to assume that if GIN is possible it might be requested in the future, + // even on single node. + memProp.allocFlags.gpuDirectRDMACapable = comm->sharedRes->ginState.ncclGin != nullptr ? 1 : 0; memProp.location.id = comm->cudaDev; - CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail); + CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail_stream); - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail); + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, NULL, bufSizeTotal, &mem), ret, fail_stream_mem); memHandle = 0x0; // Reference given to symMemoryObtain NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward. 
comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0, /*localReg=*/nullptr, &outDevComm->resourceWindow, &win, - stream), ret, fail); + stream), ret, fail_stream_mem); mem = nullptr; // Reference given to symWindowCreate - NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail); + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream_mem_win); outDevComm->resourceWindow_inlined = *winHost; + outDevComm->ginSignalShadows = (uint64_t*)add4G((char*)winHost->lsaFlatBase + ginSignalShadowsOffset, winHost->lsaRank*winHost->stride4G); - CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail_stream_mem_win); } - CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail); + if (devr->ginEnabled) { + outDevComm->ginContextCount = nGinContexts; + outDevComm->ginSignalCount = ginSignalTotal; + outDevComm->ginCounterCount = ginCounterTotal; + NCCLCHECKGOTO(ncclGinAllocSignalsCounters(comm, + ginSignalTotal, &outDevComm->ginSignalBase, + ginCounterTotal, &outDevComm->ginCounterBase + ), ret, fail_stream_mem_win); + + for (int ctx=0; ctx < nGinContexts; ctx++) { + outDevComm->ginTypes[ctx] = (int)comm->sharedRes->ginState.ginDevHandles[ctx]->netDeviceType; + outDevComm->ginHandles[ctx] = comm->sharedRes->ginState.ginDevHandles[ctx]->handle; + } + } - NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail); + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_stream_mem_win_signals); - cudaStreamDestroy(stream); + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_stream_mem_win_signals); + CUDACHECKGOTO(cudaStreamDestroy(stream), ret, fail_stream_mem_win_signals); return ret; -fail: - if (win != nullptr) { - symWindowDestroy(comm, win->vidmem, stream); - cudaStreamSynchronize(stream); - } - if (mem != nullptr) { - symMemoryDropRef(comm, mem); - } - if (memHandle != 0x0) { - CUCHECKIGNORE(cuMemRelease(memHandle)); - } - if (stream != nullptr) { - cudaStreamDestroy(stream); +fail_stream_mem_win_signals: + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + outDevComm->ginSignalBase, outDevComm->ginSignalCount, + outDevComm->ginCounterBase, outDevComm->ginCounterCount + ); } +fail_stream_mem_win: + symWindowDestroy(comm, win->vidmem, stream); + cudaStreamSynchronize(stream); +fail_stream_mem: + if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); } + symMemoryDropRef(comm, mem); +fail_stream: + cudaStreamDestroy(stream); +fail: return ret; } @@ -905,7 +1007,13 @@ NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const* ncclResult_t ncclDevCommDestroy( struct ncclComm* comm, struct ncclDevComm const* devComm ) { - //struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + devComm->ginSignalBase, devComm->ginSignalCount, + devComm->ginCounterBase, devComm->ginCounterCount + ); + } if (devComm->resourceWindow != nullptr) { NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow)); } @@ -920,7 +1028,7 @@ ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* } struct ncclDevrState* devr = &comm->devrState; - + // Validate lsaRank is within bounds if (lsaRank < 0 || lsaRank >= devr->lsaSize) { return ncclInvalidArgument; @@ -949,7 +1057,7 @@ ncclResult_t 
ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindo bool multimem = true; struct ncclDevrTeam* tm; NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm)); - + // Return the base multicast address for this team with offset *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset); return ncclSuccess; diff --git a/src/device/CMakeLists.txt b/src/device/CMakeLists.txt index 98447428d..acaa9b65d 100644 --- a/src/device/CMakeLists.txt +++ b/src/device/CMakeLists.txt @@ -50,9 +50,9 @@ set_target_properties(nccl_device PROPERTIES # Set include directories for the target target_include_directories(nccl_device PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/src/include ${CMAKE_SOURCE_DIR}/src/include/plugin - ${CMAKE_BINARY_DIR}/include ${CUDAToolkit_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS}/cccl ) diff --git a/src/device/Makefile b/src/device/Makefile index fd8f2759d..cf0fa0637 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -23,12 +23,13 @@ INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) -NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all +NVCUFLAGS_SYM += -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 +COMPILE.kernel = $(NVCC) $(NVCUFLAGS) -dw $2 -o $1 COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ diff --git a/src/device/generate.py b/src/device/generate.py index aefba9422..4b081924e 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -269,7 +269,7 @@ def validate(coll, redop, ty, algo, proto): # List of all kernel function pointers. out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs)) - out("extern void* const ncclDevKernelList[] = {\n") + out("void* ncclDevKernelList[] = {\n") index = 0 for kfn in kernel_funcs: cudart, _ = required_cuda(*kfn) @@ -281,6 +281,14 @@ def validate(coll, redop, ty, algo, proto): out("nullptr};\n") out("\n") + out("int ncclDevKernelRequirements[] = {\n") + for index,kfn in enumerate(kernel_funcs): + cudart,_ = required_cuda(*kfn) + sym = paste("_", "ncclDevKernel", *kfn) + out(" %7d, /*%4d %s*/\n" % (cudart or 0, index, sym)); + out("};\n") + out("\n") + # Maps primary id to kernel function pointer. 
out("extern void* const ncclDevKernelForFunc[] = {\n") index = 0 diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index 941b4328d..7ecad5cee 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -248,7 +248,7 @@ inline __device__ void ncclNetDeviceUnpackInner( for (int x = 0; x < iter_meta_cnt; x++) { int meta_idx = x + w * PPW; - + // load page offs loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]); diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index d36dfe5a7..16a9ed5d9 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -778,7 +778,7 @@ struct FuncSumPostDiv { using UintType = typename std::conditional::type; uint32_t divisor:31, isSigned:1; UintType recip; - + __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) { isSigned = opArg & 1; divisor = opArg >> 1; diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh index 9f050836c..98a755127 100644 --- a/src/device/symmetric/all_gather.cuh +++ b/src/device/symmetric/all_gather.cuh @@ -352,7 +352,7 @@ static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* char* blockInput = input.localPtr(); char* blockOutput = output.localPtr(); - uint32_t lowBits = nElts; + uint32_t lowBits = nAllElts; lowBits |= (uintptr_t)blockInput; lowBits |= (uintptr_t)blockOutput; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index 8e62bda5b..972e5ca93 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -249,12 +249,20 @@ def partition(vals, keyfn): emitln(f, '') emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels()))) - emitln(f, 'extern void* const ncclSymkKernelList[] = {') + emitln(f, 'void* ncclSymkKernelList[] = {') for k in enumerate_kernels(): emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) emitln(f, 'nullptr};') emitln(f, '') + emitln(f, 'int ncclSymkKernelRequirements[] = {') + for index,k in enumerate(enumerate_kernels()): + cudart, _, _ = required_cuda(k) + sym = kernel_cname(k) + emitln(f, ' %7d, /*%4d %s*/' % (cudart or 0, index, sym)); + emitln(f, '};') + emitln(f, '') + emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {') indents += 1 emitln(f, 'switch (id) {') diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh index 73305d54c..dfdde0e50 100644 --- a/src/device/symmetric/primitives.cuh +++ b/src/device/symmetric/primitives.cuh @@ -56,13 +56,14 @@ struct ncclSymkArgsHandler { workLo++; fracLo = 0; } - struct ncclSymkDevWork const& dw = devWork[workLo]; - indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; + struct ncclSymkDevWork const& dwLo = devWork[workLo]; + indexLo = ((fracLo * divUp(dwLo.nElts, EltPerCell)) >> 16) * EltPerCell; // Where the work ends workHi = channelWorkRange[block].workHi; fracHi = channelWorkRange[block].fracHi + 1; - indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); + struct ncclSymkDevWork const& dwHi = devWork[workHi]; + indexHi = min(((fracHi * divUp(dwHi.nElts, EltPerCell)) >> 16) * EltPerCell, dwHi.nElts); } template @@ -78,7 +79,7 @@ struct ncclSymkArgsHandler { lastBlock = dw.sChannelId+dw.nChannels-1; // Where the work begins - fracLo = (dw.sChannelId==0) ? 
0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF); + fracLo = (dw.sChannelId>0 && channelWorkRange[dw.sChannelId-1].workHi == w) ? ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF) : 0; indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000; indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); @@ -91,16 +92,16 @@ struct ncclSymkArgsHandler { getWorkRange(blockIdx.x, workLo, indexLo, workHi, indexHi); - size_t currentIndexLo = indexLo; #pragma unroll 1 for (int w = workLo; w <= workHi; w++) { struct ncclSymkDevWork const& dw = devWork[w]; size_t const& nAllElts = dw.nElts; - size_t currentIndexHi; + size_t currentIndexLo, currentIndexHi; int block, nBlocks; if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) { getWorkRangeFused(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi); } else { + currentIndexLo = (w > workLo) ? 0 : indexLo; currentIndexHi = (w < workHi) ? nAllElts : indexHi; block = 0; nBlocks = 1; diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh index 8f79b3990..850960845 100644 --- a/src/device/symmetric/reduce_scatter.cuh +++ b/src/device/symmetric/reduce_scatter.cuh @@ -241,7 +241,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); + reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nAllElts, output, nElts); waitNeeded = false; } @@ -323,7 +323,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkAr threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); + reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nAllElts, output.localPtr(), nElts); } ); @@ -402,7 +402,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs T* input = (T*)inputPtr.localPtr(); T* output = (T*)outputPtr.localPtr(); - uint32_t lowBits = nElts*sizeof(T); + uint32_t lowBits = nAllElts*sizeof(T); lowBits |= (uintptr_t)input; lowBits |= (uintptr_t)output; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/src/enqueue.cc b/src/enqueue.cc index 00a0ef8da..da45abe6f 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -32,16 +32,28 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int carveout = ncclParamL1SharedMemoryCarveout(); int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); + int driverVersion; + NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); + for (int sym=0; sym <= 1; sym++) { int kcount = sym==0 ? ncclDevKernelCount : ncclSymkKernelCount; - void* const* kptrs = sym==0 ? ncclDevKernelList : ncclSymkKernelList; + void** kptrs = sym==0 ? ncclDevKernelList : ncclSymkKernelList; + int* krequires = sym==0 ? ncclDevKernelRequirements : ncclSymkKernelRequirements; for (int k=0; k < kcount; k++) { + if (kptrs[k] != nullptr && driverVersion < krequires[k]) { + INFO(NCCL_INIT, "Skipping %skernel %d which requires driver %d", + sym ? 
"symmetric " : "", k, krequires[k]); + kptrs[k] = nullptr; + } void* fn = kptrs[k]; cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); - if (errcode != cudaSuccess) continue; // Silently ignore failures + if (errcode != cudaSuccess) { + cudaGetLastError(); // Drain error code + continue; // Silently ignore failures + } if (maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; } @@ -116,9 +128,14 @@ static void addWorkBatchToPlan( // batch further down. newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; if (workType == ncclDevWorkTypeP2p) { + // We only allow NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops per batch. newBatch |= chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH; - for (int i=0; i < chan->wipBatch.nP2ps; i++) { + for (int i = 0; i < chan->wipBatch.nP2ps; i++) { + // Do not allow the same round twice in the same batch. newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; + // Make sure we only aggregate p2p operations within the same p2p round epoch (one epoch is NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops). + // This enforces uniform batching accross ranks in the communicator and prevents hangs. + newBatch |= (p2pRound / NCCL_MAX_DEV_WORK_P2P_PER_BATCH) != (chan->wipBatch.p2pRounds[i] / NCCL_MAX_DEV_WORK_P2P_PER_BATCH); } } } @@ -2447,7 +2464,7 @@ static ncclResult_t collTaskAppend( NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured)); - + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); t->func = info->coll; t->sendbuff = info->sendbuff; @@ -2484,7 +2501,7 @@ static ncclResult_t ceCollTaskAppend( struct ncclDevrWindow* recvWin, struct ncclDevRedOpFull opDev) { struct ncclKernelPlanner *planner = &comm->planner; - + // Check if CE needs initialization if (comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { struct ncclCeInitTask* ceTask; @@ -2558,7 +2575,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { ncclDevrFindWindow(comm, info->sendbuff, &sendWin); ncclDevrFindWindow(comm, info->recvbuff, &recvWin); bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype); - + // Append CE collective task if CE is supported and requested by user if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) { NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev)); @@ -2601,16 +2618,21 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + // Early-out on invalid or revoked communicator + ncclResult_t ret = CommCheck(info->comm, info->opName, "comm"); + if (ret != ncclSuccess) return ncclGroupErrCheck(ret); + if (info->comm->revokedFlag) { + WARN("%s: communicator was revoked", info->opName); + return ncclGroupErrCheck(ncclInvalidUsage); + } // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls if (ncclProfilerApiState.profilerGroupDepth > 0) { ncclProfilerApiState.profilerGroupDepth++; } 
NCCLCHECK(ncclGroupStartInternal()); - ncclResult_t ret = ncclSuccess; + ret = ncclSuccess; int devOld = -1; - - NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail); // Check whether communicator is ready to communicate NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); diff --git a/src/gin/CMakeLists.txt b/src/gin/CMakeLists.txt new file mode 100644 index 000000000..e20d7ddf3 --- /dev/null +++ b/src/gin/CMakeLists.txt @@ -0,0 +1,8 @@ +# Gin sources +set(GIN_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host_proxy.cc +) + +# Add gin sources to parent scope +set(GIN_SOURCES ${GIN_SOURCES} PARENT_SCOPE) diff --git a/src/gin/gin_host.cc b/src/gin/gin_host.cc new file mode 100644 index 000000000..b42f88fde --- /dev/null +++ b/src/gin/gin_host.cc @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "param.h" +#include "graph.h" +#include "transport.h" +#include "register_inline.h" +#include "gin/gin_host.h" +#include "gin/gin_host_proxy.h" + +NCCL_PARAM(GinEnable, "GIN_ENABLE", 1); +NCCL_PARAM(GinType, "GIN_TYPE", -1); +NCCL_PARAM(GinSignalPoolSize, "GIN_SIGNAL_POOL_SIZE", 64 << 10); +NCCL_PARAM(GinCounterPoolSize, "GIN_COUNTER_POOL_SIZE", 64 << 10); + +void* ncclGinProgress(void* ginState_) { + struct ncclGinState* ginState = (struct ncclGinState*)ginState_; + while (1) { + pthread_mutex_lock(&ginState->threadLock); + if (ginState->ginProgress == 1) { + pthread_mutex_unlock(&ginState->threadLock); + for (int n=0; nginCommCount; n++) { + ncclResult_t ret; + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + ret = ncclGinProxyProgress(ginState->ncclGin, ginState->ginCtx[n]); + } else { + ret = ginState->ncclGin->ginProgress(ginState->ginComms[n]); + } + if (ret != ncclSuccess) { + __atomic_store_n(&ginState->asyncResult, ret, __ATOMIC_RELEASE); + INFO(NCCL_ALL,"%s:%d -> %d [GIN Progress Thread]", __FILE__, __LINE__, ret); + ginState->ginProgress = -2; + return NULL; + } + } + sched_yield(); + } else if (ginState->ginProgress == -1) { + pthread_mutex_unlock(&ginState->threadLock); + return NULL; + } else if (ginState->ginProgress == 0) { + pthread_cond_wait(&ginState->threadCond, &ginState->threadLock); + pthread_mutex_unlock(&ginState->threadLock); + } else { + pthread_mutex_unlock(&ginState->threadLock); + INFO(NCCL_ALL,"%s:%d -> [GIN Progress Thread] state unknown %d", __FILE__, __LINE__, ginState->ginProgress); + ginState->ginProgress = -2; + return NULL; + } + } +} + +NCCL_PARAM(GinNcontexts, "GIN_NCONTEXTS", NCCL_GIN_MAX_CONTEXTS); + +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (ginState->ncclGin == NULL) { + WARN("GIN not supported."); + return ncclInvalidUsage; + } + if (ncclParamGinEnable() == 0) { + WARN("GIN is disabled."); + return ncclInternalError; + } + if (ginState->connected) return ncclSuccess; + + NCCLCHECK(ginState->ncclGin->init(&ginState->ginInstance, comm->commHash, ncclDebugLog)); + + int ndev = 0; + NCCLCHECK(ginState->ncclGin->devices(&ndev)); + if (ndev <= 0) { + WARN("No GIN-capable devices found."); + return ncclInternalError; + } + + ncclNetProperties_t props; + NCCLCHECK(ginState->ncclGin->getProperties(0, &props)); + 
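// --- Illustrative aside (not part of the patch): a hedged sketch of the
// progress-thread handshake used by ncclGinProgress above. The shared state
// means 1 = poll the network, 0 = sleep on the condition variable until woken,
// -1 = exit; the main thread flips the state under the same mutex and signals
// the condition variable. ProgressCtl and progressLoop are hypothetical names.
#include <pthread.h>
#include <sched.h>

struct ProgressCtl {
  pthread_mutex_t lock;
  pthread_cond_t cond;
  int state;               // 1 = poll, 0 = sleep, -1 = exit
};

static void progressCtlInit(ProgressCtl* ctl, int initialState) {
  pthread_mutex_init(&ctl->lock, nullptr);
  pthread_cond_init(&ctl->cond, nullptr);
  ctl->state = initialState;
}

static void* progressLoop(void* arg) {
  ProgressCtl* ctl = (ProgressCtl*)arg;
  while (true) {
    pthread_mutex_lock(&ctl->lock);
    if (ctl->state == -1) { pthread_mutex_unlock(&ctl->lock); return nullptr; }
    if (ctl->state == 0) {
      pthread_cond_wait(&ctl->cond, &ctl->lock);  // re-acquires the lock on wakeup
      pthread_mutex_unlock(&ctl->lock);
      continue;                                   // re-check the state before polling
    }
    pthread_mutex_unlock(&ctl->lock);
    // ... poll every GIN context here, as ncclGinProgress does above ...
    sched_yield();  // yield between polling passes so other host threads can run
  }
}
// Shutdown mirrors ncclGinFinalize: take the lock, set the state to -1, signal
// the condition variable, unlock, then pthread_join the progress thread.
// --- End of illustrative aside.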
ginState->ginType = props.netDeviceType; + if ((ncclParamGinType() != -1) && (ginState->ginType != ncclParamGinType())) { + WARN("GIN-capable device type mismatch."); + return ncclInternalError; + } + + int nLocalNets; + int64_t localNets[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocalNets(comm->topo, comm->rank, localNets, &nLocalNets)); + + void** handles = NULL; + char* allHandles = NULL; + + ginState->ginCommCount = std::min(NCCL_GIN_MAX_CONTEXTS, ncclParamGinNcontexts()); + + NCCLCHECKGOTO(ncclCalloc(&allHandles, (size_t)comm->nRanks * NCCL_NET_HANDLE_MAXSIZE), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&handles, comm->nRanks), ret, fail); + for (int r = 0; r < comm->nRanks; r++) handles[r] = allHandles + r * NCCL_NET_HANDLE_MAXSIZE; + + ginState->signalSpaceSize = ncclParamGinSignalPoolSize(); + if (ginState->signalSpaceSize < 0 || (1 << 30) <= ginState->signalSpaceSize) { + WARN("NCCL_GIN_SIGNAL_POOL_SIZE has invalid value."); + ginState->signalSpaceSize = 64 << 10; + } + ginState->counterSpaceSize = ncclParamGinCounterPoolSize(); + if (ginState->counterSpaceSize < 0 || (1 << 30) <= ginState->counterSpaceSize) { + WARN("NCCL_GIN_COUNTER_POOL_SIZE has invalid value."); + ginState->counterSpaceSize = 64 << 10; + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + void* listenComm; + NCCLCHECKGOTO( + ginState->ncclGin->listen(ginState->ginInstance, localNets[n%nLocalNets], + allHandles + NCCL_NET_HANDLE_MAXSIZE * comm->rank, &listenComm), + ret, fail); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allHandles, NCCL_NET_HANDLE_MAXSIZE), ret, + fail); + NCCLCHECKGOTO(ginState->ncclGin->connect(comm->ginContext, handles, comm->nRanks, comm->rank, + listenComm, ginState->ginComms + n), + ret, fail); + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECKGOTO(ncclGinProxyCreateContext(comm, ginState->ginComms[n], localNets[n%nLocalNets], + ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } else { + NCCLCHECKGOTO(ginState->ncclGin->createContext( + ginState->ginComms[n], ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } + NCCLCHECKGOTO(ginState->ncclGin->closeListen(listenComm), ret, fail); + } + free(handles); + handles = NULL; + free(allHandles); + allHandles = NULL; + + // Check whether we need proxy progress and if so, start / wake up the progress thread. 
+ ginState->needsProxyProgress = 0; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginDevHandles[n]->needsProxyProgress) ginState->needsProxyProgress = 1; + } + if (ginState->needsProxyProgress) { + ginState->ginProgress = 1; + pthread_mutex_init(&ginState->threadLock, NULL); + pthread_cond_init(&ginState->threadCond, NULL); + PTHREADCHECK(pthread_create(&ginState->thread, NULL, ncclGinProgress, ginState), "pthread_create"); + ncclSetThreadName(ginState->thread, "NCCL GIN Progress%2d", comm->cudaDev); + } + + ncclSpaceConstruct(&ginState->counterSpace); + ncclSpaceConstruct(&ginState->signalSpace); + +exit: + if (ret == ncclSuccess) ginState->connected = true; + return ret; +fail: + free(allHandles); + free(handles); + goto exit; +} + +ncclResult_t ncclGinFinalize(struct ncclComm* comm) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (!ginState->connected) return ncclSuccess; + + if (ginState->needsProxyProgress) { + pthread_mutex_lock(&ginState->threadLock); + comm->sharedRes->ginState.ginProgress = -1; + pthread_cond_signal(&ginState->threadCond); + pthread_mutex_unlock(&ginState->threadLock); + PTHREADCHECK(pthread_join(ginState->thread, NULL), "pthread_join"); + } + + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ncclGinProxyDestroyContext(ginState->ncclGin, ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + } + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ginState->ncclGin->destroyContext(ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + if (ginState->ginComms[n] != NULL) { + NCCLCHECK(ginState->ncclGin->closeColl(ginState->ginComms[n])); + ginState->ginComms[n] = NULL; + } + } + NCCLCHECK(ginState->ncclGin->finalize(ginState->ginInstance)); + memset(ginState, 0, sizeof(*ginState)); + return ncclSuccess; +} + +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyRegister(ginState->ncclGin, ginState->ginCtx[n], address, size, + NCCL_PTR_CUDA, 0, &ginHostWins[n], &ginDevWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->regMrSym(ginState->ginComms[n], address, size, NCCL_PTR_CUDA, 0, + &ginHostWins[n], &ginDevWins[n])); + } + if (ginHostWins[n] == NULL) { + WARN("rank %d - GIN Symmetric register failed: buff %p, size %ld", comm->rank, address, size); + return ncclSystemError; + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyDeregister(ginState->ncclGin, ginState->ginCtx[n], ginHostWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->deregMrSym(ginState->ginComms[n], ginHostWins[n])); + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + int64_t start; 
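  // Note: the two ranges below come from independent pools. Signals are
  // allocated first, then counters; if the counter allocation fails, the
  // signal range is released again via fail_signals so that repeated devcomm
  // create/destroy cycles do not leak entries from the signal pool.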
+ if (nSignals != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->signalSpace, ginState->signalSpaceSize, nSignals, 1, &start), ret, + fail); + *outSignal0 = (uint32_t)start; + } + if (nCounters != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->counterSpace, ginState->counterSpaceSize, nCounters, 1, &start), + ret, fail_signals); + *outCounter0 = (uint32_t)start; + } + return ncclSuccess; +fail_signals: + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, *outSignal0, nSignals); +fail: + return ret; +} + +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, signal0, nSignals); + if (nCounters != 0) ncclSpaceFree(&ginState->counterSpace, counter0, nCounters); + return ncclSuccess; +} + +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError) { + bool hasError_ = false; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) + NCCLCHECK(ncclGinProxyQueryLastError(ginState->ncclGin, ginState->ginCtx[n], &hasError_)); + else + NCCLCHECK(ginState->ncclGin->queryLastError(ginState->ginCtx[n], &hasError_)); + if (hasError_) break; + } + *hasError = hasError_; + return ncclSuccess; +} diff --git a/src/gin/gin_host_proxy.cc b/src/gin/gin_host_proxy.cc new file mode 100644 index 000000000..511e38b40 --- /dev/null +++ b/src/gin/gin_host_proxy.cc @@ -0,0 +1,501 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "nccl.h" +#include "comm.h" +#include "gin/gin_host.h" +#include "alloc.h" +#include "checks.h" +#include "gdrwrap.h" +#include "plugin/nccl_net.h" +#include "nccl_device/gin/proxy/gin_proxy_device_host_common.h" + +NCCL_PARAM(GinProxyQueueSize, "GIN_PROXY_QUEUE_SIZE", -1); +extern int64_t ncclParamIbDataDirect(); +extern int64_t ncclParamDmaBufEnable(); + +struct ginProxyGfdState { + ncclGinProxyOp_t op; + uint16_t counterId; + int done; + void *request; +}; + +// a member might be on the GPU, if it has a *GdrHandle counterpart +struct ginProxyHostGpuCtx { + size_t queueSize; + + // size = nRanks * queueSize + ncclGinProxyGfd_t *queues; + void *cisGdrHandle; + // Consumed Indices, one per rank + uint32_t *cis; + // to decrease the number of reads/writes to cis which might be on the GPU + uint32_t *cisShadow; + // Seen Indices one per rank + uint32_t *sis; + + // same size as queues + struct ginProxyGfdState *states; + // same size as queues + uint64_t *inlines; + // inlines is registered as a memory region with the GIN plugin + void *inlinesMhandle; + void *inlinesGinHandle; +}; + +struct ginProxyCtx { + struct ncclComm *comm; + void *collComm; + ncclNetDeviceHandle_v11_t *devHandle; + ncclNetProperties_t props; + + // GPU queues, if GDR on the GPU, else on the CPU + // Queue size, must be a power of 2 + struct ginProxyHostGpuCtx *hostGpuCtx; + + void *countersGdrHandle; + uint64_t *counters; + uint64_t *countersDev; + CUmemGenericAllocationHandle signalsCumemhandle; + void *signalsMhandle; + void *signalsGinHandle; + uint64_t *signalsDev; + int hasError; +}; + +// Depending on GDR, allocate memory on the CPU or GPU. 
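// (GDRCopy maps device memory into the host address space through the GPU BAR,
// so when it is available these control structures can live in GPU memory while
// the proxy thread still reads and writes them directly from the CPU; without
// GDRCopy they fall back to pinned host memory that both CPU and GPU access.)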
+// host_flags is not used for now, but it is here for future use. +template +static ncclResult_t allocMemCPUAccessible(T **ptr, T **devPtr, size_t nelem, int host_flags, + void **gdrHandle, bool forceHost = false) { + if (ncclGdrCopy && !forceHost) { + NCCLCHECK(ncclGdrCudaCalloc(ptr, devPtr, nelem, gdrHandle)); + } else { + NCCLCHECK(ncclCuMemHostAlloc((void **)ptr, NULL, nelem * sizeof(T))); + memset((void *)*ptr, 0, nelem * sizeof(T)); + *devPtr = *ptr; + if (gdrHandle) *gdrHandle = NULL; // Mark as host allocated by nulling GDR handle + } + return ncclSuccess; +} + +// Depending on GDR, free memory on the CPU or GPU. +template +static ncclResult_t freeMemCPUAccessible(T *ptr, void *gdrHandle) { + if (gdrHandle != NULL) { // If a GDR handle exists, it was GDR memory + NCCLCHECK(ncclGdrCudaFree(gdrHandle)); + } else { // Otherwise, it was host memory (or GDR was off) + NCCLCHECK(ncclCuMemHostFree(ptr)); + } + return ncclSuccess; +} + +static ncclResult_t getDmaBufFd(void *addr, size_t length, int *fd, + bool forceNonDataDirect = false) { + if (ncclParamDmaBufEnable() == 0) return ncclInvalidUsage; + +#if CUDA_VERSION >= 11070 + static size_t hostPageSize = sysconf(_SC_PAGESIZE); + size_t alignedSize = length; + ALIGN_SIZE(alignedSize, hostPageSize); + +#if CUDA_VERSION >= 12080 + if (ncclParamIbDataDirect() && !forceNonDataDirect) { + CUresult status = pfn_cuMemGetHandleForAddressRange( + (void *)fd, (CUdeviceptr)addr, alignedSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (status == CUDA_SUCCESS) return ncclSuccess; + } +#endif + CUresult status = pfn_cuMemGetHandleForAddressRange((void *)fd, (CUdeviceptr)addr, alignedSize, + CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0); + if (status == CUDA_SUCCESS) return ncclSuccess; +#endif + + return ncclInvalidUsage; +} + +static ncclResult_t proxyGinPollCompletions(ncclGin_t *ginComm, void *collComm, + struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx) { + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // loop on all seen but unconsumed GFDs + for (uint32_t i = hostGpuCtx->cisShadow[targetRank]; i < hostGpuCtx->sis[targetRank]; i++) { + uint32_t idx = i & (hostGpuCtx->queueSize - 1); + struct ginProxyGfdState *state = + &hostGpuCtx->states[targetRank * hostGpuCtx->queueSize + idx]; + // no need to poll if already done + if (!state->done) { + ginComm->test(collComm, state->request, &state->done); + if (state->done) { + TRACE(NCCL_NET, "GFD completed - stateIdx: %lu, request: %p", state - hostGpuCtx->states, + state->request); + // update the counter specified in the GFD + if (state->op & ncclGinProxyOpWithCounter) { + __atomic_store_n(&ctx->counters[state->counterId], ctx->counters[state->counterId] + 1, + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated counter %d to %ld", state->counterId, + ctx->counters[state->counterId]); + } + } + } + // allow holes in the CI space to get resolved + if (state->done && i == hostGpuCtx->cisShadow[targetRank]) { + // tell the GPU that we have consumed the GFD + __atomic_store_n(&hostGpuCtx->cis[targetRank], ++hostGpuCtx->cisShadow[targetRank], + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated cis[%u] to %u", targetRank, hostGpuCtx->cisShadow[targetRank]); + } + } + } + + return ncclSuccess; +} + +static int proxyGinPollGfd(struct ginProxyCtx *ctx, ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState **state) { + ncclGinProxyGfd_t *q = hostGpuCtx->queues + targetRank * 
hostGpuCtx->queueSize; + uint32_t idx = hostGpuCtx->sis[targetRank] & (hostGpuCtx->queueSize - 1); + ncclGinProxyQword_t qword; + __atomic_load(&q[idx].qword[ncclGinProxyGfdHeader].raw, &qword.raw, __ATOMIC_RELAXED); + if (qword.flag.v == 0) { + return 0; + } + + // We know for sure that the first qword is there, copy it. + gfd->qword[ncclGinProxyGfdHeader] = q[idx].qword[ncclGinProxyGfdHeader]; + // Wait for and copy the other qwords. + for (int k = 1; k < ncclGinProxyGfdQwords; k++) { + do { + __atomic_load(&q[idx].qword[k].raw, &qword.raw, __ATOMIC_RELAXED); + } while (qword.flag.v == 0); + gfd->qword[k] = qword; + } + // Now we have the full GFD in the local struct. + + // Reset the GFD in the queue. This lets the producer know that the GFD is consumed. + for (int k = 0; k < ncclGinProxyGfdQwords; k++) { + __atomic_store_n(&q[idx].qword[k].raw, 0, __ATOMIC_RELAXED); + } + + // set the counter_id into the state + uint32_t stateIdx = targetRank * hostGpuCtx->queueSize + idx; + *state = &hostGpuCtx->states[stateIdx]; + (*state)->op = (ncclGinProxyOp_t)(gfd->qword[ncclGinProxyGfdHeader].header.op); + (*state)->counterId = gfd->qword[ncclGinProxyGfdCompletion].completion.counterId; + (*state)->done = 0; + (*state)->request = NULL; + + TRACE(NCCL_NET, + "GFD to target PE %d raw idx: %u, idx: %u - op: %#lx, size: %lu, srcOff: %lu, dstOff: %lu, " + "srcHandle: %lu, dstHandle: %lu, counterId: %u, signalId: %u, stateIdx: %u", + targetRank, hostGpuCtx->sis[targetRank], idx, gfd->qword[ncclGinProxyGfdHeader].header.op, + gfd->qword[ncclGinProxyGfdHeader].header.size, + gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff, + gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff, + gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle, + gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle, + gfd->qword[ncclGinProxyGfdCompletion].completion.counterId, + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId, stateIdx); + + hostGpuCtx->sis[targetRank]++; + + return 1; +} + +static int mapGfdOpToCollNetOp(ncclGinProxyGfd_t *gfd) { + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & + (ncclGinProxyOpComplMask & ~ncclGinProxyOpWithCounter)) { + case ncclGinProxyOpWithSignalInc: + return NCCL_NET_SIGNAL_OP_INC; + case ncclGinProxyOpWithSignalAdd: + return NCCL_NET_SIGNAL_OP_ADD; + default: + return -1; + } +} + +static ncclResult_t proxyGinProcessGfd(ncclGin_t *ginComm, void *collComm, struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState *state) { + int signalOp; + uint64_t signalVal; + + uint64_t size = gfd->qword[ncclGinProxyGfdHeader].header.size; + uint64_t srcOff; + void *srcHandle; + if (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpWithInline) { + uint64_t *inlineVal = &hostGpuCtx->inlines[gfd - hostGpuCtx->queues]; + srcOff = (uint64_t)&inlineVal[0] - (uint64_t)hostGpuCtx->inlines; + // reconstruct the inline value from the two qwords + *inlineVal = gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow; + if (size == 8) { + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow2 << 32; + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.inlineValHigh << 48; + } + srcHandle = hostGpuCtx->inlinesMhandle; + } else { + srcOff = gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff; + srcHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle; + } + uint64_t dstOff = gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff; + void 
*dstHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle; + + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpBaseMask) { + case ncclGinProxyOpPut: + signalOp = mapGfdOpToCollNetOp(gfd); + if (signalOp == -1) { + // First cast from 63 bits to 64 bits and then to void * to avoid warnings + NCCLCHECK(ginComm->iput(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, &state->request)); + } else { + // reconstruct the signal value from the two qwords + signalVal = gfd->qword[ncclGinProxyGfdCompletion].completion.signalValLow; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValLow2 << 16; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValHigh << 32; + uint64_t signalOff = + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId * sizeof(uint64_t); + NCCLCHECK(ginComm->iputSignal(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, signalOff, ctx->signalsGinHandle, signalVal, + signalOp, &state->request)); + } + break; + default: + // this error should already have been checked in pollGfd + assert(0); + } + TRACE(NCCL_NET, "GFD submitted into GIN plugin - stateIdx: %lu, request: %p", + state - hostGpuCtx->states, state->request); + return ncclSuccess; +} + +static uint64_t isPowerOfTwo(uint64_t n) { return (n > 0) && ((n & (n - 1)) == 0); } + +// Check if the GIN plugin supports DMA-BUF, if so we can try to get the DMA-BUF handle from CUDA, +// if that fails we fallback to non-DMA-BUF +static ncclResult_t ncclGinProxyRegMrSym(ncclGin_t *ginComm, struct ginProxyCtx *ctx, void *addr, + size_t size, int type, int mr_flags, void **mhandle, + void **ginHandle) { + if (type == NCCL_PTR_HOST) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } else if (type == NCCL_PTR_CUDA) { + ncclResult_t dmabufResult = ncclInvalidUsage; + if (ncclParamDmaBufEnable() && (ctx->props.ptrSupport & NCCL_PTR_DMABUF)) { + ncclResult_t registrationResult = ncclSuccess; + int dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd); + if (dmabufResult == ncclSuccess) { + registrationResult = ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle); + close(dmabufFd); + } + if (registrationResult != ncclSuccess) { + dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd, true); + if (dmabufResult == ncclSuccess) { + NCCLCHECK(ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle)); + close(dmabufFd); + } + } + } + // Fallback to non-DMA-BUF if the DMA-BUF handle is not supported + if (dmabufResult != ncclSuccess) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } + } else { + return ncclInvalidUsage; + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle) { + ncclGin_t *ginComm = (ncclGin_t *)comm->sharedRes->ginState.ncclGin; + + if (!ncclGdrCopy) + INFO(NCCL_NET, "GIN Proxy will not be using GDRCopy"); + + struct ginProxyCtx *proxyCtx = NULL; + NCCLCHECK(ncclCalloc(&proxyCtx, 1)); + + proxyCtx->comm = comm; + proxyCtx->collComm = collComm; + + // Sanitize the queue size + NCCLCHECK(ginComm->getProperties(devId, &proxyCtx->props)); + uint64_t queueSize = ncclParamGinProxyQueueSize(); + uint32_t maxRequests = 
NCCL_NET_MAX_REQUESTS * proxyCtx->props.maxRecvs; + if (queueSize == -1) { + queueSize = maxRequests; + } + if (queueSize > maxRequests) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is greater than the maximum outstanding requests in the GIN " + "plugin (%d), using the default/maximum value instead", + maxRequests); + queueSize = maxRequests; + } + if (queueSize < 1) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is less than 1, using the default/maximum value instead"); + queueSize = maxRequests; + } + if (!isPowerOfTwo(queueSize)) { + INFO( + NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is not a power of two, using the default/maximum value instead"); + queueSize = maxRequests; + } + + // Allocate the counters on the GPU or CPU depending on GDR + NCCLCHECK(allocMemCPUAccessible(&proxyCtx->counters, &proxyCtx->countersDev, nCounters, + CU_MEMHOSTALLOC_WRITECOMBINED, + &proxyCtx->countersGdrHandle)); + + // Allocate the signals on the GPU and then register the memory region with the GIN plugin. + // Enforcing strong ordering on the signals mr is vital to ensure ordering between puts and + // signals. + size_t signalsBufSize = nSignals * sizeof(uint64_t); + NCCLCHECK(ncclCuMemAlloc((void **)&proxyCtx->signalsDev, &proxyCtx->signalsCumemhandle, + CU_MEM_HANDLE_TYPE_NONE, signalsBufSize)); + CUDACHECK(cudaMemset(proxyCtx->signalsDev, 0, signalsBufSize)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, proxyCtx->signalsDev, signalsBufSize, + NCCL_PTR_CUDA, NCCL_NET_MR_FLAG_FORCE_SO, + &proxyCtx->signalsMhandle, &proxyCtx->signalsGinHandle)); + + NCCLCHECK(ncclCalloc(&proxyCtx->hostGpuCtx, 1)); + struct ginProxyHostGpuCtx *hostGpuCtx = proxyCtx->hostGpuCtx; + hostGpuCtx->queueSize = queueSize; + size_t queuesLength = hostGpuCtx->queueSize * comm->nRanks; + NCCLCHECK(ncclCalloc(&hostGpuCtx->states, queuesLength)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->cisShadow, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->sis, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->inlines, queuesLength)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, hostGpuCtx->inlines, + queuesLength * sizeof(uint64_t), NCCL_PTR_HOST, 0, + &hostGpuCtx->inlinesMhandle, &hostGpuCtx->inlinesGinHandle)); + + ncclGinProxyGpuCtx_t devGpuCtx_h; + devGpuCtx_h.nranks = comm->nRanks; + devGpuCtx_h.queueSize = hostGpuCtx->queueSize; + devGpuCtx_h.counters = proxyCtx->countersDev; + devGpuCtx_h.signals = proxyCtx->signalsDev; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_h.pis, comm->nRanks)); + + // Allocate the GFD queues, CIs, counters, signals and test/wait variables on the either the CPU + // or GPU. 
+ NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->queues, &devGpuCtx_h.queues, queuesLength, 0, + NULL, true /*forceHost*/)); + NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->cis, &devGpuCtx_h.cis, comm->nRanks, + CU_MEMHOSTALLOC_WRITECOMBINED, &hostGpuCtx->cisGdrHandle)); + + ncclGinProxyGpuCtx_t *devGpuCtx_d = NULL; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_d, 1)); + // Copy the proxy's devGpuCtx to the GPU + NCCLCHECK(ncclCudaMemcpy(devGpuCtx_d, &devGpuCtx_h, 1)); + + ncclNetDeviceHandle_v11_t *devHandle = NULL; + NCCLCHECK(ncclCalloc(&devHandle, 1)); + devHandle->netDeviceType = NCCL_NET_DEVICE_GIN_PROXY; + devHandle->netDeviceVersion = NCCL_GIN_PROXY_VERSION; + devHandle->handle = (void *)devGpuCtx_d; + devHandle->size = 0; + devHandle->needsProxyProgress = 1; + + proxyCtx->devHandle = devHandle; + + *outDevHandle = devHandle; + *outGinCtx = proxyCtx; + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Register the memory region with the GIN plugin + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, ctx, addr, size, type, mr_flags, mhandle, ginHandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Deregister the memory region with the GIN plugin + NCCLCHECK(ginComm->deregMrSym(ctx->collComm, mhandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx) { + if (!ginCtx) return ncclSuccess; + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + // Free counters + if (ctx) { + if (ctx->counters || ctx->countersGdrHandle) + freeMemCPUAccessible(ctx->counters, ctx->countersGdrHandle); + + // Free signals + if (ginComm && ctx->collComm && ctx->signalsMhandle) + ginComm->deregMrSym(ctx->collComm, ctx->signalsMhandle); + if (ctx->signalsDev) ncclCudaFree(ctx->signalsDev); + + // Free hostGpuCtx and its allocations + struct ginProxyHostGpuCtx *hostGpuCtx = ctx->hostGpuCtx; + if (hostGpuCtx) { + if (hostGpuCtx->cisShadow) free(hostGpuCtx->cisShadow); + if (hostGpuCtx->sis) free(hostGpuCtx->sis); + if (hostGpuCtx->states) free(hostGpuCtx->states); + if (hostGpuCtx->inlines) free(hostGpuCtx->inlines); + if (ginComm && ctx->collComm && hostGpuCtx->inlinesMhandle) + ginComm->deregMrSym(ctx->collComm, hostGpuCtx->inlinesMhandle); + if (hostGpuCtx->queues) freeMemCPUAccessible(hostGpuCtx->queues, NULL); + if (hostGpuCtx->cis || hostGpuCtx->cisGdrHandle) + freeMemCPUAccessible(hostGpuCtx->cis, hostGpuCtx->cisGdrHandle); + free(hostGpuCtx); + } + + ncclNetDeviceHandle_v11_t *devHandle = (ncclNetDeviceHandle_v11_t *)ctx->devHandle; + if (devHandle) { + if (devHandle->handle) ncclCudaFree((void *)devHandle->handle); + free(devHandle); + } + + free(ctx); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + NCCLCHECK(proxyGinPollCompletions(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx)); + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // Poll on the GFD queue + ncclGinProxyGfd_t gfd; + struct ginProxyGfdState *state = NULL; + if (proxyGinPollGfd(ctx, ctx->hostGpuCtx, targetRank, &gfd, &state)) { + ncclResult_t ret = + proxyGinProcessGfd(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx, targetRank, &gfd, 
state); + if (ret) ctx->hasError = ret; + NCCLCHECK(ret); + } + if (ginComm->ginProgress) ginComm->ginProgress(ctx->collComm); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + *hasError = ctx->hasError; + return ncclSuccess; +} diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 86d185bc0..253c57489 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -266,14 +266,18 @@ ncclResult_t ncclGetUserP2pLevel(int* level) { return ncclSuccess; } +// Tests two ranks for CUDA P2P connectivity. +// *cudaP2p returns 1 if CUDA P2P between the ranks is supported. +// *p2p returns 1 only if the distance between the ranks is no greater than NCCL_P2P_LEVEL. The connection may go through an intermediate rank. ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, - int* p2p, int *read, int* intermediateRank) { + int* p2p, int *read, int* intermediateRank, int* cudaP2p) { int mnnvl = 0; struct ncclPeerInfo* info1 = NULL; struct ncclPeerInfo* info2 = NULL; *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; + if (cudaP2p) *cudaP2p = 0; // Rule out different nodes / isolated containers if (comm) { @@ -325,11 +329,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; + // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to + // validate against NVML at all since they are pretending to be on other hw. + bool checkNvml = (ncclParamIgnoreDisabledP2p() != 2 && g1 != g2 && + (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && + info1->hostHash == info2->hostHash))); if (*p2p == 1) { - // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to - // validate against NVML at all since they are pretending to be on other hw. - if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && - info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) { + if (checkNvml) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); @@ -365,6 +371,19 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } + if (cudaP2p) { + if (checkNvml) { + int n1, n2; + n1 = system->nodes[GPU].nodes[g1].gpu.dev; + n2 = system->nodes[GPU].nodes[g2].gpu.dev; + *cudaP2p = (ncclNvmlDevicePairs[n1][n2].p2pStatusRead == NVML_P2P_STATUS_OK && + ncclNvmlDevicePairs[n1][n2].p2pStatusWrite == NVML_P2P_STATUS_OK); + } else { + // We assume P2P connectivity in case the ranks are connected using MNNVL or are on the same host. 
+ *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash); + } + } + return ncclSuccess; } @@ -591,7 +610,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, struct ncclTopoSystem* system = comm->topo; *nranks = 0; *intermediateRanks = NULL; - if (system->nodes[NET].count == 0) return ncclSuccess; + if (system->inter == 0) return ncclSuccess; int nr = 0; int* ranks = NULL; @@ -650,7 +669,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank, - system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); + system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -780,10 +799,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail); } - if (system->nodes[GPU].count == comm->nRanks) { - for (int n=system->nodes[NET].count-1; n>=0; n--) - NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail); - } + system->inter = system->nodes[GPU].count == comm->nRanks ? 0 : 1; exit: free(domains); if (ids) free(ids); diff --git a/src/graph/rings.cc b/src/graph/rings.cc index 5d967abb9..70fac75b1 100644 --- a/src/graph/rings.cc +++ b/src/graph/rings.cc @@ -26,6 +26,11 @@ void dumpLine(int* values, int nranks, const char* prefix) { } ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { + ncclResult_t ret = ncclSuccess; + uint64_t* rankFound; + int rankFoundSize = DIVUP(nranks, 64); + NCCLCHECK(ncclCalloc(&rankFound, rankFoundSize)); + for (int r=0; rmaxBw = 0.0; system->totalBw = 0.0; - int inter = system->nodes[NET].count; + int inter = system->inter; if (inter == 0 && system->nodes[GPU].count == 1) { system->maxBw = LOC_BW; system->totalBw = LOC_BW; @@ -496,14 +496,14 @@ static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, return ncclSuccess; } -// Build a sorted list of the NETs to try. +// Build a sorted list of the NETs to try, the list will follow the NETDEVS_POLICY set by the user. // -// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu -// index when trying to get back to the NIC. +// The value of "gpu" can be set to -1 to build a list suitable for all GPUs (for example for the search start). +// The value of "gpu" can be set to the desired index when trying to get back to the NIC. // // The list is built the following way: -// 1. Select NETs starting with those close to GPU(s), based on paths[n].type. -// 2. add other NETs satisfying typeInter but not already in the list. +// 1. First gather the preferred NETs for each of the GPU(s), based on the NETDEVS_POLICY and the connection. +// 2. If the NETDEV_policy allows it, add all the other NETs satisfying typeInter but not already in the list of preferred NETs. NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; @@ -518,9 +518,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } + // Get the maximum of network devices allowed, depending on the policy. 
+ // If the policy is not MAX, then allow all devices. + int maxDevCount = 0; + enum netDevsPolicy netDevsPolicy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&netDevsPolicy, &maxDevCount)); + if (gpu == -1) maxDevCount *= system->nodes[GPU].count; + if (netDevsPolicy != NETDEVS_POLICY_MAX) maxDevCount = NCCL_TOPO_MAX_NODES; + if (netCount >= maxDevCount) goto exit; + // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { for (int g = 0; g < system->nodes[GPU].count; g++) { + // do not consider this GPU if it's not the GPU we asked for if (gpu != -1 && gpu != g) continue; int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; @@ -532,16 +542,37 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; i= maxDevCount) goto exit; } } } +exit: *netCountRet = netCount; return ret; } +NCCL_PARAM(MnnvlRailPerHost, "MNNVL_RAIL_PER_HOST", 0); + +static bool ncclTopoSearchCheckNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* startNet, int n, int step) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) return false; // Trees are symmetric + if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { + if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels - 1) * 2]) return false; + } else if (graph->crossNic == 0) { + if (ncclParamMnnvlRailPerHost() && NCCL_TOPO_ID_SYSTEM_ID(net->id) != NCCL_TOPO_ID_SYSTEM_ID(startNet->id)) { + // Different hosts in an MNNVL system: rails are per host and identified with the PCI id. + if (net->net.pciId != startNet->net.pciId || net->net.port != startNet->net.port) return false; + } else { + if (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port) return false; + } + } + if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE && step != 0 && net->id != graph->inter[graph->nChannels*2+1]) return false; + return true; +} + ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -567,7 +598,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC - if (system->nodes[NET].count) { + if (system->inter) { int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; @@ -575,24 +606,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; - if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric - if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { - if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue; - } else { - if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; - } - + if (!ncclTopoSearchCheckNet(system, graph, startNet, n, step)) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter;
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; - else if (net->id != graph->inter[graph->nChannels*2+1]) continue; graph->bwInter /= 2; } + struct ncclTopoNode* net; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { @@ -744,7 +768,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo * `--> NET n (or m if crossNic) */ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { - if (system->nodes[NET].count) { + if (system->inter) { if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1; else *backToNet = 0; @@ -760,7 +784,7 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); - if (system->nodes[NET].count) { + if (system->inter) { // Start from NET ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { @@ -876,7 +900,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc int* intra = graph->intra+ngpus*c; NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); struct ncclXmlNode* node; - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0])); } @@ -896,7 +920,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc NCCLCHECK(xmlSetAttrLong(node, "dev", dev)); if (graph->id == 3) break; // NVLS graphs only use the first GPU } - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1])); } @@ -979,7 +1003,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); } - if (system->nodes[NET].count > 0) { + if (system->inter) { NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); maxTypeIntra = maxTypeInter; @@ -1016,7 +1040,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; - if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + if (system->inter == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. graph->minChannels = graph->maxChannels; } @@ -1036,7 +1060,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph // First try crossnic, then decrease bw and finally increase bwIntra. int nspeeds = 0; float* speedArray = NULL; - if (system->nodes[NET].count == 0) { + if (system->inter == 0) { nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? 
sm90SpeedArrayIntra : speedArrayIntra); } else { @@ -1096,14 +1120,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } tmpGraph.pattern = graph->pattern; - int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + int maxIntra = system->inter ? tmpGraph.typeInter : maxTypeIntra; if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; if (tmpGraph.typeIntra < PATH_DIS) goto search; } tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->inter && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; if (tmpGraph.typeInter < PATH_DIS) goto search; } @@ -1181,7 +1205,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr for (int c=0; cnChannels; c++) { sprintf(line, "%2d :", c); int offset = strlen(line); - if (system->nodes[NET].count > 0) { + if (system->inter) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); offset = strlen(line); } @@ -1193,7 +1217,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); if (graph->id == 3) break; // NVLS graphs only use the first GPU } - if (system->nodes[NET].count > 0) { + if (system->inter) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); offset = strlen(line); } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 3a87725f1..be533b57f 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -357,25 +357,38 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); + int64_t netId = NCCL_TOPO_ID(systemId, dev); struct ncclTopoNode* net; - NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev))); + NCCLCHECK(ncclTopoCreateNode(system, &net, NET, netId)); net->net.dev = dev; const char* str; + // if not guid is present use the net->id unique id instead, which will be unique within the node/NVLD NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); - if (str) sscanf(str, "0x%lx", &net->net.asic); - else net->net.asic = dev; + net->net.asic = (str) ? 
strtoull(str, NULL, 16) : netId; + - ncclDebugNoWarn = NCCL_GRAPH; int mbps; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0), NCCL_GRAPH); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; - if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); - ncclDebugNoWarn = 0; + ncclResult_t ret; + NOWARN(ret = xmlGetAttrFloat(xmlNet, "latency", &net->net.latency), NCCL_GRAPH); + if (ret != ncclSuccess) net->net.latency = 0; + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0), NCCL_GRAPH); + + // build the PCI id using the parent PCI link + uint64_t hacc[2] = {1, 1}; + const char* busId = NULL; + struct ncclXmlNode* parent = xmlNet->parent; + while (parent != NULL && strcmp(parent->name, "pci") != 0) parent = parent->parent; + if (parent) NCCLCHECK(xmlGetAttr(parent, "busid", &busId)); + // If we fail to find the PCIe path, we use the GUID instead. + if (busId) eatHash(hacc, busId, strlen(busId)); + else eatHash(hacc, &net->net.asic); + net->net.pciId = digestHash(hacc); NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); @@ -998,7 +1011,8 @@ ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netIn // Trigger the merge, then get the new device's properties int vDevIndex = 0; - ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps); + ncclResult_t ret; + NOWARN(ret = netInfo->makeVDevice(&vDevIndex, vProps), NCCL_GRAPH|NCCL_INIT|NCCL_NET); if (ret != ncclSuccess) { INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. 
Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); @@ -1582,16 +1596,8 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } -enum netDevsPolicy { - NETDEVS_POLICY_AUTO = 0x0, - NETDEVS_POLICY_ALL = 0x1, - NETDEVS_POLICY_MAX = 0x2, - NETDEVS_POLICY_UNDEF = 0xffffffff -}; - -static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static int netDevsPolicyNum = -1; - +static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static void getNetDevsPolicyOnce() { const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY"); if (envStr) { @@ -1614,6 +1620,18 @@ static void getNetDevsPolicyOnce() { if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO; } +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum) { + static pthread_once_t onceNetDevsPolicy = PTHREAD_ONCE_INIT; + pthread_once(&onceNetDevsPolicy, getNetDevsPolicyOnce); + if (netDevsPolicy == NETDEVS_POLICY_MAX && netDevsPolicyNum <= 0) { + WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); + return ncclInternalError; + } + if (policy) *policy = netDevsPolicy; + if (policyNum && netDevsPolicyNum >= 0) *policyNum = netDevsPolicyNum; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); @@ -1626,22 +1644,19 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclInternalError; } - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once,getNetDevsPolicyOnce); int netsPerGpu = 0; - if (netDevsPolicy == NETDEVS_POLICY_AUTO) { + int policyCount = 0; + enum netDevsPolicy policy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&policy, &policyCount)); + if (policy == NETDEVS_POLICY_AUTO) { int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); netsPerGpu = DIVUP(localNetCount, localGpuCount); - } else if (netDevsPolicy == NETDEVS_POLICY_ALL) { + } else if (policy == NETDEVS_POLICY_ALL) { netsPerGpu = localNetCount; - } else if (netDevsPolicy == NETDEVS_POLICY_MAX) { - if (netDevsPolicyNum <= 0) { - WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); - return ncclInternalError; - } - netsPerGpu = std::min(netDevsPolicyNum, localNetCount); + } else if (policy == NETDEVS_POLICY_MAX) { + netsPerGpu = std::min(policyCount, localNetCount); } else { WARN("Unknown netDevs policy"); return ncclInternalError; @@ -1655,6 +1670,21 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclSuccess; } +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount) { + int gpu; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); + int localNetIndexes[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNetIndexes, localNetCount, NULL)); + + if (*localNetCount == 0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + // Convert index to ids + for (int n=0; n<*localNetCount; n++) localNets[n] = system->nodes[NET].nodes[localNetIndexes[n]].id; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* 
system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; diff --git a/src/graph/topo.h b/src/graph/topo.h index 49d408d95..f153620e7 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -138,6 +138,7 @@ struct ncclTopoNode { }gpu; struct { int dev; // Plugin dev number + uint64_t pciId; uint64_t asic; int port; float bw; @@ -177,6 +178,7 @@ struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; float maxBw; float totalBw; + int inter; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index bfb279850..0520e7234 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -176,7 +176,7 @@ static const ncclTunerConstants_t ncclTunerConstantsDefaults = { {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */ - {2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */ + {2*36.7, 34.6, 2*36.7}, /* Blackwell (N1/N2/N4) */ }, .perChMaxTreeLL128Bws = { {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 010120627..3ab8e20dd 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -575,32 +575,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); char* path = NULL; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path) { NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } int index; - ncclDebugNoWarn = NCCL_GRAPH; - NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"), NCCL_GRAPH); } - ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path) { @@ -635,7 +631,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections int nlinks; - char* peers; + char* peers = NULL; NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); 
for (int l=0; lparent; @@ -868,9 +865,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); char* path; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) { // Remote NVLink device is not visible inside this VM. Assume NVSwitch. NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); diff --git a/src/include/allocator.h b/src/include/allocator.h index 05da29a62..eccb5b5cd 100644 --- a/src/include/allocator.h +++ b/src/include/allocator.h @@ -7,6 +7,10 @@ #ifndef NCCL_ALLOCATOR_H_ #define NCCL_ALLOCATOR_H_ +#include "nccl.h" +#include +#include + //////////////////////////////////////////////////////////////////////////////// // ncclSpace: Allocates contiguous segments of non-negative integers. Useful // as a memory allocator when we can't put allocator state within the memory diff --git a/src/include/channel.h b/src/include/channel.h index bd34f54c1..d5058c4f3 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -19,9 +19,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRa inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { int base; if (comm->nNodes > 1) { - int nodeDelta = p2pRound/comm->maxLocalRanks; - int localDelta = p2pRound%comm->maxLocalRanks; - base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + int localSize = comm->p2pSchedGroupSize; + int groupDelta = p2pRound / localSize; + int localDelta = p2pRound % localSize; + base = groupDelta*divUp(localSize, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; } else { base = p2pRound; diff --git a/src/include/checks.h b/src/include/checks.h index cbb5a2de4..16f515516 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -137,6 +137,21 @@ } \ } while (0) +#define NCCLCHECKNOWARN(call, FLAGS) do { \ + ncclResult_t RES; \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + return RES; \ + } \ +} while (0) + +#define NCCLCHECKGOTONOWARN(call, RES, label, FLAGS) do { \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + goto label; \ + } \ +} while (0) + #define NCCLWAIT(call, cond, abortFlagPtr) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ diff --git a/src/include/comm.h b/src/include/comm.h index 22faf3682..e1b37db16 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -140,6 +140,9 @@ struct ncclSharedResources { /* proxy related shared res */ struct ncclProxyState* proxyState; + + // GIN state + struct ncclGinState ginState; }; struct ncclChannel { @@ -455,6 +458,7 @@ struct ncclComm { ncclNet_t* ncclNet; void* netContext; + void* ginContext; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; @@ -468,7 +472,7 @@ struct ncclComm { int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported - bool directMode; + bool directMode; // if any process manages more than one local rank int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. 
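The NCCLCHECKNOWARN/NCCLCHECKGOTONOWARN macros added to src/include/checks.h above pair the new NOWARN helper (defined in src/include/debug.h below) with the usual error propagation, so callers no longer have to save and restore ncclDebugNoWarn by hand. A minimal sketch of the intended call pattern follows; probeNetSpeed is a hypothetical helper, not part of this patch, while xmlGetAttrIntDefault and the NCCL_GRAPH flag are the real call and flag used in src/graph/topo.cc:

static ncclResult_t probeNetSpeed(struct ncclXmlNode* xmlNet, int* mbps) {
  // WARN logs from the GRAPH subsystem are suppressed while probing the
  // optional "speed" attribute, but a genuine failure still returns an error.
  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", mbps, 0), NCCL_GRAPH);
  if (*mbps <= 0) *mbps = 10000; // Some NICs report speed = -1
  return ncclSuccess;
}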
@@ -523,6 +527,7 @@ struct ncclComm { // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; + int p2pSchedGroupSize; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; @@ -550,6 +555,7 @@ struct ncclComm { uint32_t* childAbortFlag; uint32_t* childAbortFlagDev; uint32_t destroyFlag; + uint32_t revokedFlag; // Device side of the communicator (for cudaFree's) struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm @@ -651,11 +657,12 @@ struct ncclComm { // CE Collective struct ncclCeColl ceColl; struct ncclIntruQueue ceInitTaskQueue; - + // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; - bool isAllDirectP2p; + bool isAllDirectP2p; // Subject to NCCL_P2P_LEVEL (for local ranks only). + bool isAllCudaP2p; // Raw CUDA capability (for local ranks only). int symmetricSupport; bool useNetPXN; bool useGdr; diff --git a/src/include/debug.h b/src/include/debug.h index 3822e8760..f332d749d 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -29,6 +29,14 @@ extern char ncclLastError[]; #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define NOWARN(EXPR, FLAGS) \ + do { \ + int oldNoWarn = ncclDebugNoWarn; \ + ncclDebugNoWarn = FLAGS; \ + (EXPR); \ + ncclDebugNoWarn = oldNoWarn; \ + } while(0) + #define INFO(FLAGS, ...) \ do{ \ int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ diff --git a/src/include/dev_runtime.h b/src/include/dev_runtime.h index 5f6e66e33..70bf77496 100644 --- a/src/include/dev_runtime.h +++ b/src/include/dev_runtime.h @@ -52,6 +52,7 @@ struct ncclDevrState { int* lsaRankList; size_t granularity; // cuMemGetAllocationGranularity + bool ginEnabled; struct ncclDevrMemory* memHead; struct ncclDevrWindowSorted* winSorted; int winSortedCapacity, winSortedCount; diff --git a/src/include/device.h b/src/include/device.h index 9ffc26095..b1cef15b3 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -53,7 +53,7 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0 #endif -#include "net_device.h" +#include "nccl_device/net_device.h" enum ncclDevRedOp_t { ncclDevSum, ncclDevProd, ncclDevMinMax, @@ -153,6 +153,7 @@ struct ncclProxyConnector { int sameProcess; struct ncclProxyConnection* connection; ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary + ncclResult_t (*proxyGinProgress)(struct ncclProxyState* proxyState); }; struct ncclConnector { @@ -528,7 +529,8 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ // Host-side table of kernel function pointers. extern int const ncclDevKernelCount; -extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; +extern void* ncclDevKernelList[/*ncclDevKernelCount*/]; +extern int ncclDevKernelRequirements[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. extern int const ncclDevFuncRowToId[]; diff --git a/src/include/env.h b/src/include/env.h new file mode 100644 index 000000000..0e00b3144 --- /dev/null +++ b/src/include/env.h @@ -0,0 +1,23 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_ENV_H_ +#define NCCL_INT_ENV_H_ + +#include "nccl_env.h" + +// Initialize Env Plugin +ncclResult_t ncclEnvPluginInit(void); +// Finalize Env Plugin +void ncclEnvPluginFinalize(void); +// Env plugin get function for NCCL params, called in ncclGetEnv() +const char* ncclEnvPluginGetEnv(const char* name); + +bool ncclEnvPluginInitialized(void); + +ncclResult_t ncclInitEnv(void); + +#endif diff --git a/src/include/gin/gin_host.h b/src/include/gin/gin_host.h new file mode 100644 index 000000000..d82a79505 --- /dev/null +++ b/src/include/gin/gin_host.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_GIN_HOST_H_ +#define _NCCL_GIN_HOST_H_ + +#include "allocator.h" +#include "nccl.h" +#include "nccl_net.h" +#include "nccl_device/gin/gin_device_host_common.h" +#include + +struct ncclGinState { + ncclGin_t* ncclGin; + void* ginInstance; + bool connected; + int ginType; + int ginCommCount; + void* ginComms[NCCL_GIN_MAX_CONTEXTS]; + void* ginCtx[NCCL_GIN_MAX_CONTEXTS]; + ncclNetDeviceHandle_t* ginDevHandles[NCCL_GIN_MAX_CONTEXTS]; + int needsProxyProgress; // Whether we need to progress GIN operations with the proxy + int ginProgress; // GIN progress is enabled + pthread_t thread; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + ncclResult_t asyncResult; + + int signalSpaceSize; + int counterSpaceSize; + ncclSpace signalSpace; + ncclSpace counterSpace; +}; + +extern int64_t ncclParamGinType(); + +// FIXME change to ncclGinState instead of ncclComm, no need to pass comm +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm); +ncclResult_t ncclGinFinalize(struct ncclComm* comm); +ncclResult_t ncclGinProgress(struct ncclGinState* ginState); +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0); +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters); +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError); + +#endif diff --git a/src/include/gin/gin_host_proxy.h b/src/include/gin/gin_host_proxy.h new file mode 100644 index 000000000..14e8b93ca --- /dev/null +++ b/src/include/gin/gin_host_proxy.h @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef GIN_HOST_PROXY_H_ +#define GIN_HOST_PROXY_H_ + +#include +#include +#include +#include +#include "nccl.h" +#include "gin/gin_host.h" +#include "plugin/nccl_net.h" + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle); +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle); +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle); +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError); + +#endif diff --git a/src/include/graph.h b/src/include/graph.h index 6b926717e..203b6a1d1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -34,7 +34,7 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank, int* cudaP2p); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); enum ncclTopoGdrMode { ncclTopoGdrModeDisable = 0, @@ -73,9 +73,18 @@ ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev); +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); +enum netDevsPolicy { + NETDEVS_POLICY_AUTO = 0x0, + NETDEVS_POLICY_ALL = 0x1, + NETDEVS_POLICY_MAX = 0x2, + NETDEVS_POLICY_UNDEF = 0xffffffff +}; +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum); + // Allows for up to 32 NICs per node on GB200-NVL72 #define NCCL_TOPO_MAX_NODES 576 ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType); diff --git a/src/include/group.h b/src/include/group.h index 6e317c6c4..3b08d9f16 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -82,6 +82,10 @@ inline ncclResult_t ncclGroupStartInternal() { return ncclSuccess; } +inline bool ncclGroupEnabled() { + return ncclGroupDepth != 0; +} + inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; diff --git a/src/include/nccl_device.h b/src/include/nccl_device.h index 
88b2531d1..35e216c62 100644 --- a/src/include/nccl_device.h +++ b/src/include/nccl_device.h @@ -4,12 +4,12 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/coop.h" +#include "nccl_device/impl/barrier__funcs.h" +#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/impl/core__funcs.h" #include "nccl_device/impl/ll_a2a__funcs.h" -#include "nccl_device/impl/mem_barrier__funcs.h" -//#include "nccl_device/net_barrier__funcs.h" -//#include "nccl_device/net_scratch_a2a__funcs.h" -//#include "nccl_device/barrier__funcs.h" +#include "nccl_device/impl/lsa_barrier__funcs.h" +#include "nccl_device/impl/gin__funcs.h" +#include "nccl_device/impl/gin_barrier__funcs.h" #include "nccl_device/impl/ptr__funcs.h" diff --git a/src/include/nccl_device/barrier.h b/src/include/nccl_device/barrier.h new file mode 100644 index 000000000..0c11f6e5c --- /dev/null +++ b/src/include/nccl_device/barrier.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_BARRIER_H_ +#define _NCCL_DEVICE_BARRIER_H_ +#include "impl/core__types.h" +#include "impl/lsa_barrier__types.h" +#include "impl/gin_barrier__types.h" + +#if __CUDACC__ +template +struct ncclBarrierSession_internal; + +template +struct ncclBarrierSession: ncclBarrierSession_internal { + // Full featured constructor: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin, + ncclLsaBarrierHandle innerBarHandle, + ncclGinBarrierHandle outerBarHandle, + uint32_t index, + bool multimem=false, ncclMultimemHandle innerMmHandle={} + ); + // Convenience constructors for baked in teams: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagRail, ncclGin, uint32_t index + ); + + ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable + + NCCL_DEVICE_INLINE ncclLsaBarrierSession& lsaBarrier(); + NCCL_DEVICE_INLINE ncclGinBarrierSession& ginBarrier(); + + NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel); +}; +#endif + +#endif // _NCCL_DEVICE_BARRIER_H_ diff --git a/src/include/nccl_device/coop.h b/src/include/nccl_device/coop.h index 9a8d4b0a8..4af229dfb 100644 --- a/src/include/nccl_device/coop.h +++ b/src/include/nccl_device/coop.h @@ -30,7 +30,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp. return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2); } NCCL_DEVICE_INLINE void sync() { - __syncwarp(laneMask()); + if (nThreadsPow2 > 1) __syncwarp(laneMask()); } }; #endif @@ -43,7 +43,7 @@ typedef ncclCoopTile<32> ncclCoopWarp; #if __CUDACC__ struct ncclCoopLanes { // Some lanes of this warp. 
uint32_t lmask; - + NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {} NCCL_DEVICE_INLINE int thread_rank() const { @@ -71,7 +71,7 @@ struct ncclCoopWarpSpan { NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id): warp0(warp0), nWarps(nWarps), id(id) { } - + NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x - 32*warp0; } @@ -100,16 +100,16 @@ struct ncclCoopCta { #if __CUDACC__ template -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopTile coop) { return coop.laneMask(); } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopLanes coop) { return coop.lmask; } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopWarpSpan coop) { return -1u; } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopCta coop) { return -1u; } #endif @@ -126,6 +126,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; } #endif +#if __CUDACC__ +template +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; } +#endif + #if __CUDACC__ // Pick threads of our warp that are safe to use collectively. NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() { @@ -149,4 +157,55 @@ NCCL_DEVICE_INLINE ncclCoopTile ncclCoopCoalesced(ncclCoopTile +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile, T value, int root, bool entrySync=true) { + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads); + return v; +} +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) { + uint32_t m = coop.lmask; + uint32_t r = root == 0 ? 
__ffs(m)-1 : __fns(m, 0, 1+root); + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r); + return v; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() { + __shared__ ulong2 stash[15]; + return stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value; + coop.sync(); + return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id]; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() { + __shared__ ulong2 stash; + return &stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value; + coop.sync(); + return *(T*)ncclCoopBcast_Cta_stash(); +} +#endif + #endif diff --git a/src/include/nccl_device/core.h b/src/include/nccl_device/core.h index dd41d6925..9b0061a72 100644 --- a/src/include/nccl_device/core.h +++ b/src/include/nccl_device/core.h @@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t; typedef uint32_t ncclDevResourceHandle; typedef ncclDevResourceHandle ncclDevResourceHandle_t; +typedef uint32_t ncclGinSignal_t; +typedef uint32_t ncclGinCounter_t; + struct ncclLsaBarrierHandle; typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t; +struct ncclGinBarrierHandle; +typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t; + struct ncclLLA2AHandle; typedef struct ncclLLA2AHandle ncclLLA2AHandle_t; @@ -59,13 +65,26 @@ struct ncclDevCommRequirements { bool lsaMultimem; // Enable multimem on lsa team + int barrierCount; int lsaBarrierCount; + int railGinBarrierCount; + + int lsaLLA2ABlockCount, lsaLLA2ASlotCount; + + bool ginForceEnable; + int ginContextCount; // This is a hint, the actual context count in the devcomm may not match. + int ginSignalCount; // Guaranteed to start at id=0 + int ginCounterCount; // Guaranteed to start at id=0 }; struct ncclDevResourceRequirements { ncclDevResourceRequirements_t* next; size_t bufferSize, bufferAlign; ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate. + int ginSignalCount; + int ginCounterCount; + ncclGinSignal_t* outGinSignalStart; + ncclGinCounter_t* outGinCounterStart; }; struct ncclTeamRequirements { diff --git a/src/include/nccl_device/gin.h b/src/include/nccl_device/gin.h new file mode 100644 index 000000000..45623d08c --- /dev/null +++ b/src/include/nccl_device/gin.h @@ -0,0 +1,207 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_SESSION_H_ +#define _NCCL_DEVICE_GIN_SESSION_H_ +#include "core.h" +#include "gin/gin_device_common.h" + +#if __CUDACC__ +struct ncclGinCtx; // Definition in nccl_device/gin/gin_device_host_common.h +template struct ncclGinCtx_M; // ... 
+ +struct ncclGinDescriptorSmem; // A type user allocates in __shared__ memory + +// Used as completion actions for ncclGinSession::put +struct ncclGin_None {}; + +struct ncclGin_SignalAdd { ncclGinSignal_t signal; uint64_t value; }; +// SignalInc: equivalent to SignalAdd{+1} except it may not be mixed with any +// other signal operator without intervening signal reset(). Formally: for a +// given signal, all operations between successive reset()'s of that signal must +// either all be SignalInc or all not SignalInc. +struct ncclGin_SignalInc { ncclGinSignal_t signal; }; +// Support deferred: +// struct ncclGin_SignalSet { ncclGinSignal_t signal; uint64_t value; }; +struct ncclGin_CounterInc { ncclGinCounter_t counter; }; + +struct ncclGin_DescriptorSmem { ncclGinDescriptorSmem* descriptor; }; + +template +struct ncclGin_BackendMask; + +template +using ncclGin_BackendOne = ncclGin_BackendMask<(1u<<(int)backend)>; + +using ncclGin = ncclGin_BackendMask; + +#endif + +#if __CUDACC__ +template +struct ncclGin_BackendMask { + ncclDevComm const& comm; + uint32_t nContexts:8, contextId:8, _ginBackend:8; + + // Loads GIN context into registers. Each context has one QP per peer. + NCCL_DEVICE_INLINE ncclGin_BackendMask(ncclDevComm const&, int contextIndex); + + template< + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this netContext to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, + ncclWindow_t srcWnd, size_t srcOffset, size_t bytes, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this context to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. 
Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclSymPtr dstElts, ncclSymPtr srcElts, size_t nElts, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclSymPtr dst, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template + NCCL_DEVICE_INLINE void signal( + ncclTeam, int peer, RemoteAction remoteAction, + Coop coop = ncclCoopThread(), + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + // All source buffers from put's from any thread in this coop will be safe to reuse. + // Flush does not guarantee that data has settled in remote memory. + template + NCCL_DEVICE_INLINE void flush(Coop, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Counter and signal wait use "rolling" comparison logic of a given bit-width + // such that unsigned overflow does not disturb the property that: x < x+1. + // + // bool rolling_less_equal(uint64_t a, uint64_t b, int bits) { + // uint64_t m = uint64_t(-1)>>(64-bits); + // return ((b-a) & m) <= (m>>1); + // } + // + // The condition waited for is that the supplied value is rolling_less_equal + // to the internal value. + // + // Counters are restricted to using a maximum of 56 bits despite that being fewer + // than a uint64_t can carry. + + NCCL_DEVICE_INLINE uint64_t readCounter(ncclGinCounter_t counter, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + template + NCCL_DEVICE_INLINE void waitCounter(Coop, ncclGinCounter_t counter, uint64_t least, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Each signal has a dedicated "shadow" which the user is free to manipulate for + // any reason. The only calls which manipulate the shadow are `increaseSignalShadow` + // and `resetSignal`. + NCCL_DEVICE_INLINE uint64_t* getSignalShadowPtr(ncclGinSignal_t signal) const; + NCCL_DEVICE_INLINE void increaseSignalShadow(ncclGinSignal_t signal, uint64_t delta) const; + + // Returns current value of signal with all but bottom bits set to zero. 
+ NCCL_DEVICE_INLINE uint64_t readSignal(ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed value. + template + NCCL_DEVICE_INLINE void waitSignal(Coop, ncclGinSignal_t signal, uint64_t least, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed shadow value. + template + NCCL_DEVICE_INLINE void waitSignalMeetShadow(Coop, ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait until signal exceeds shadow by `leastDelta` (typically 1), updates shadow + // with latest value, and returns with `before` equal to previous shadow value + // and `delta` equal to difference. + template + NCCL_DEVICE_INLINE void waitSignalFollowShadow(Coop, ncclGinSignal_t signal, Uint leastDelta, Uint* before, Uint* delta, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Sets to zero. May not race with concurrent modifications to counter. + NCCL_DEVICE_INLINE void resetCounter(ncclGinCounter_t counter) const; + // Sets signal and shadow to zero. May not race with concurrent modifcations to signal. + NCCL_DEVICE_INLINE void resetSignal(ncclGinSignal_t signal) const; + + ////////////////////////////////////////////////////////////////////////////// + // internal: + + void* _ginHandle; + uint64_t* _signalShadows; + + NCCL_DEVICE_INLINE ncclGinCtx_M _makeCtx() const; +}; +#endif + +#endif // _NCCL_DEVICE_GIN_SESSION_H_ diff --git a/src/include/nccl_device/gin/gdaki/gin_gdaki.h b/src/include/nccl_device/gin/gdaki/gin_gdaki.h new file mode 100644 index 000000000..c14a5e292 --- /dev/null +++ b/src/include/nccl_device/gin/gdaki/gin_gdaki.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_GDAKI_H_ +#define _NCCL_DEVICE_GIN_GDAKI_H_ + +#ifndef DOCA_VERBS_USE_CUDA_WRAPPER +#define DOCA_VERBS_USE_CUDA_WRAPPER +#endif + +#ifndef DOCA_VERBS_USE_NET_WRAPPER +#define DOCA_VERBS_USE_NET_WRAPPER +#endif + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1 +#endif + +#include "../gin_device_common.h" +#include "gin_gdaki_device_host_common.h" +#include "doca_gpunetio/doca_gpunetio_device.h" + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#include +#endif + +template <> +struct ncclGinApi_Put { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins, + ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin, + size_t srcOff, size_t bytes, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasCounter, + ncclGinCounter_t counterId, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + doca_gpu_dev_verbs_qp* companion_qp; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin; + + doca_gpu_dev_verbs_addr raddr, laddr; + if (hasWins) { + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey); + } + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + doca_gpu_dev_verbs_addr counter_raddr, counter_laddr; + if (hasCounter) { + companion_qp = loadConst(&gdaki->companion_gdqp) + peer; + counter_raddr.addr = sizeof(uint64_t) * counterId; + counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank); + counter_laddr.addr = 0; + counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasWins) { + if (hasSignal && hasCounter) { + doca_gpu_dev_verbs_put_signal_counter( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, + counter_laddr, 1); + } else if (hasSignal) { + doca_gpu_dev_verbs_put_signal( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg); + } else if (hasCounter) { + doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr, + counter_laddr, 1); + } else { + doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes); + } + } else { + if (hasCounter) { + doca_gpu_dev_verbs_signal_counter( + qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1); + } else { + doca_gpu_dev_verbs_signal( + qp, sig_raddr, sig_laddr, signalOpArg); + } + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); + if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp); +#endif + } + coop.sync(); + } +}; + 
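// Illustrative sketch (not part of this patch): how a device kernel is expected to
// drive the one-sided GIN API declared in src/include/nccl_device/gin.h earlier in
// this series. The kernel name and the devComm/dstWin/srcWin/sig parameters are
// hypothetical; ncclGin, put(), waitSignal(), flush(), ncclGin_SignalAdd and
// ncclCoopCta are the entry points declared above. It assumes the peer runs the
// same kernel towards us, so each side both signals and waits.
__global__ void ginExchangeSketch(ncclDevComm devComm, ncclTeam team, int peer,
                                  ncclWindow_t dstWin, ncclWindow_t srcWin,
                                  size_t bytes, ncclGinSignal_t sig) {
  ncclCoopCta cta;                           // all threads of this CTA cooperate
  ncclGin gin(devComm, /*contextIndex=*/0);  // load one GIN context into registers
  // One-sided put from our window into the peer's window; once the payload has
  // settled on the peer, its signal `sig` is incremented by 1.
  gin.put(team, peer, dstWin, /*dstOffset=*/0, srcWin, /*srcOffset=*/0, bytes,
          ncclGin_SignalAdd{sig, 1}, ncclGin_None{}, cta);
  // Wait for the peer's matching put to bump our signal, then make our own
  // source buffer safe to reuse.
  gin.waitSignal(cta, sig, /*least=*/1);
  gin.flush(cta);
}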
+template <> +struct ncclGinApi_PutValue { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin, + size_t dstOff, T srcVal, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + + doca_gpu_dev_verbs_addr raddr; + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasSignal) { + doca_gpu_dev_verbs_p_signal( + qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg); + } else { + doca_gpu_dev_verbs_p(qp, raddr, srcVal); + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); +#endif + } + coop.sync(); + } +}; + +template <> +struct ncclGinApi_ResetCounter { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->counters_table.buffer)[counterId] = 0; + } +}; + +template <> +struct ncclGinApi_ResetSignal { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->signals_table.buffer)[signalId] = 0; + } +}; + +template <> +struct ncclGinApi_GetCounterPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->counters_table.buffer) + counterId; + } +}; + +template <> +struct ncclGinApi_GetSignalPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->signals_table.buffer) + signalId; + } +}; + +template <> +struct ncclGinApi_Flush { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qps = loadConst(&gdaki->gdqp); +#pragma unroll 1 + for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size()) { + doca_gpu_dev_verbs_wait(qps + peer); + } + } +}; + +#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */ diff --git a/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h b/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h new file mode 100644 index 000000000..20299346f --- /dev/null +++ 
+++ b/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+
+#include
+
+#define NCCL_GIN_GDAKI_VERSION 100
+
+template <typename T>
+struct ncclGinGdakiGlobalGPUBufferTable {
+  T *buffer;
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+struct ncclGinGdakiGPUContext {
+  struct doca_gpu_dev_verbs_qp *gdqp;
+  struct doca_gpu_dev_verbs_qp *companion_gdqp;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> counters_table;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> signals_table;
+
+  // Local buffer we don't consume but is required for some operations.
+  __be32 sink_buffer_lkey;
+};
+
+struct ncclGinGdakiMemHandle {
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */
diff --git a/src/include/nccl_device/gin/gin_device_api.h b/src/include/nccl_device/gin/gin_device_api.h
new file mode 100644
index 000000000..20dde3af3
--- /dev/null
+++ b/src/include/nccl_device/gin/gin_device_api.h
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef _NCCL_GIN_DEVICE_API_H_
+#define _NCCL_GIN_DEVICE_API_H_
+
+#include "gin_device_common.h"
+
+#if NCCL_GIN_GDAKI_ENABLE
+#include "gdaki/gin_gdaki.h"
+#endif
+#if NCCL_GIN_PROXY_ENABLE
+#include "proxy/gin_proxy.h"
+#endif
+
+#endif
diff --git a/src/include/nccl_device/gin/gin_device_common.h b/src/include/nccl_device/gin/gin_device_common.h
new file mode 100644
index 000000000..d0d4c8fa3
--- /dev/null
+++ b/src/include/nccl_device/gin/gin_device_common.h
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_COMMON_H_
+#define _NCCL_GIN_DEVICE_COMMON_H_
+
+#include "../net_device.h"
+#include "../utility.h"
+#include "gin_device_host_common.h"
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900
+#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+#ifndef NCCL_GIN_PROXY_ENABLE
+#define NCCL_GIN_PROXY_ENABLE 1
+#endif
+
+#ifndef NCCL_GIN_GDAKI_ENABLE
+#if CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700
+#define NCCL_GIN_GDAKI_ENABLE 1
+#else
+#define NCCL_GIN_GDAKI_ENABLE 0
+#endif
+#endif
+
+#define NCCL_GIN_BACKEND_MASK_ALL \
+  (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \
+   ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI)
+
+struct ncclGinCtx {
+  ncclNetDeviceType backend;
+  int rank;
+  int nRanks;
+  void* handle;
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinCtx_M : ncclGinCtx {};
+
+struct ncclGinDescriptorSmem {
+  alignas(16) char space[64];
+};
+
+#if __CUDACC__
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Put {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_PutValue {
+  template <typename T, typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcData, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetSignalPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinSignal_t signalId);
+};
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetCounterPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetSignal {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetCounter {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Flush {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord);
+};
+#endif
+
+#if __CUDACC__
+template